Files
archery/test/synth_compose_yolo.py
2026-05-11 16:26:05 +08:00

876 lines
30 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
合成训练数据:把「靶子」贴到随机背景上,并自动生成标注(无需手工标注)。
前置条件(推荐):
- 靶子用带透明通道的 PNG抠图后脚本按非透明像素算紧贴 bbox
- 若只有矩形靶图无 alpha可用整张图作为矩形框贴入略松
输出(默认 Pascal VOC适配 MaixCam 等平台):
- images/xxx.jpg
- xml/xxx.xml与图片同名单目标或多目标时可扩展
- 生成张数不超过 --max-images默认 3000
可选 YOLO
- labels/xxx.txtclass cx cy w h相对 0~1
多三角形检测Pascal VOC 多 <object>,适配 YOLOv5 转 VOC 训练):
- 提供 --triangles-json顶点在与 --fg 一致的原始靶图像素坐标系下;
- 脚本先按 alpha 外接框裁切靶图,顶点会自动减去裁切偏移;
- 透视变换时同步变换顶点,每张图输出多个三角形框;
- 默认标注为顶点轴对齐最小外接矩形;可选 --triangle-bbox-pad-frac 四周加比例余量(与推理 margin 对齐)。
Stage2 ROI对齐「先检整靶再裁小图」的第二步输入
- --stage2-crop在合成+增强后,按靶子外接框四周随机 padding 裁剪,标注改到裁剪图坐标系;
- 有 --triangles-json 时默认要求裁剪后三角形数与 JSON 一致,否则丢弃重采样(可用 --stage2-allow-partial
运动模糊(模拟手持/快门,默认约一半样本会施加;标注仍为几何真值,与真机域更接近):
- --motion-prob施加概率--motion-kernel-min/max模糊 streak 长度(奇数核,越大越糊)。
- 可与 --blur-max 高斯模糊叠加Stage2 建议:--motion-prob 0.5~0.7 --motion-kernel-max 35 --blur-max 1.2
依赖OpenCV + NumPyPC 上跑即可Maix 上若内存够也可试)。
示例:
python test/synth_compose_yolo.py --bg-dir ./bg --fg ./target_cutout.png --out ./synth_out --num 3000
python test/synth_compose_yolo.py ... --triangles-json test/archery_triangles_default.json --class-name triangle --stage2-crop
python test/synth_compose_yolo.py ... --zip ./dataset_voc.zip
python test/synth_compose_yolo.py ... --format yolo --out ./synth_yolo
"""
from __future__ import annotations
import argparse
import json
import os
import random
import sys
import zipfile
import xml.etree.ElementTree as ET
import numpy as np
def _collect_images(folder: str, exts=(".jpg", ".jpeg", ".png", ".bmp")):
out = []
for name in sorted(os.listdir(folder)):
low = name.lower()
if low.endswith(exts):
out.append(os.path.join(folder, name))
return out
def _load_triangles_json(path: str) -> list[list[tuple[float, float]]]:
with open(path, encoding="utf-8") as f:
data = json.load(f)
tris = data.get("triangles")
if not isinstance(tris, list) or not tris:
raise ValueError(f'JSON 需包含非空 "triangles" 数组: {path}')
out: list[list[tuple[float, float]]] = []
for t in tris:
if not isinstance(t, list) or len(t) != 3:
raise ValueError(f"每个三角形需 3 个顶点: {t!r}")
pts = []
for p in t:
if not isinstance(p, (list, tuple)) or len(p) != 2:
raise ValueError(f"顶点需为 [x,y]: {p!r}")
pts.append((float(p[0]), float(p[1])))
out.append(pts)
return out
def _warp_triangle_points(
corners_fg_orig: list[tuple[float, float]],
fx0: float,
fy0: float,
fw0: float,
fh0: float,
new_w: int,
new_h: int,
persp_M,
px: int,
py: int,
np,
cv2,
) -> np.ndarray:
"""原始靶图像素坐标下的三角形顶点 -> 合成图上的 (3,2) float32。"""
pts = np.array(corners_fg_orig, dtype=np.float32)
pts[:, 0] -= fx0
pts[:, 1] -= fy0
pts[:, 0] *= new_w / max(fw0, 1e-6)
pts[:, 1] *= new_h / max(fh0, 1e-6)
if persp_M is not None:
pts = cv2.perspectiveTransform(pts.reshape(1, -1, 2), persp_M).reshape(-1, 2)
pts[:, 0] += px
pts[:, 1] += py
return pts
def _triangle_xyxy_exclusive(
pts_xy: np.ndarray, img_w: int, img_h: int
) -> tuple[int, int, int, int] | None:
xs = pts_xy[:, 0]
ys = pts_xy[:, 1]
bx0 = max(0, min(img_w - 1, int(np.floor(float(xs.min())))))
by0 = max(0, min(img_h - 1, int(np.floor(float(ys.min())))))
bx1 = max(bx0 + 1, min(img_w, int(np.ceil(float(xs.max())))))
by1 = max(by0 + 1, min(img_h, int(np.ceil(float(ys.max())))))
if bx1 <= bx0 or by1 <= by0:
return None
return bx0, by0, bx1, by1
def _expand_xyxy_half_open(
bx0: int,
by0: int,
bx1: int,
by1: int,
img_w: int,
img_h: int,
pad_frac: float,
) -> tuple[int, int, int, int] | None:
"""在半开框 [bx0,bx1)×[by0,by1) 四周按 max(宽,高)×pad_frac 对称扩展,并裁入图像。"""
if pad_frac <= 1e-9:
return bx0, by0, bx1, by1
bw = max(1, bx1 - bx0)
bh = max(1, by1 - by0)
base = float(max(bw, bh))
p = float(pad_frac) * base
x0 = int(np.floor(float(bx0) - p))
y0 = int(np.floor(float(by0) - p))
x1 = int(np.ceil(float(bx1) + p))
y1 = int(np.ceil(float(by1) + p))
iw, ih = max(1, img_w), max(1, img_h)
x0 = max(0, min(x0, iw - 1))
y0 = max(0, min(y0, ih - 1))
x1 = max(x0 + 1, min(x1, iw))
y1 = max(y0 + 1, min(y1, ih))
if x1 <= x0 or y1 <= y0:
return None
return x0, y0, x1, y1
def _stage2_crop_window(
tx0: int,
ty0: int,
tx1: int,
ty1: int,
img_w: int,
img_h: int,
pad_min_frac: float,
pad_max_frac: float,
rng: random.Random,
) -> tuple[int, int, int, int] | None:
"""
以靶子轴对齐框 [tx0,tx1)×[ty0,ty1)(半开)为中心,四周加随机 padding相对 max(宽,高) 的比例),
再限制在图像内。返回 (cx0, cy0, cw, ch) 用于 comp[cy0:cy0+ch, cx0:cx0+cw]。
"""
iw, ih = max(1, img_w), max(1, img_h)
tw = max(1, tx1 - tx0)
th = max(1, ty1 - ty0)
base = float(max(tw, th))
p0 = max(0.0, float(pad_min_frac))
p1 = max(p0, float(pad_max_frac))
pad = rng.uniform(p0, p1) * base
cx0 = int(np.floor(float(tx0) - pad))
cy0 = int(np.floor(float(ty0) - pad))
cx1 = int(np.ceil(float(tx1) + pad))
cy1 = int(np.ceil(float(ty1) + pad))
cx0 = max(0, min(cx0, iw - 1))
cy0 = max(0, min(cy0, ih - 1))
cx1 = max(cx0 + 1, min(cx1, iw))
cy1 = max(cy0 + 1, min(cy1, ih))
cw, ch = cx1 - cx0, cy1 - cy0
if cw < 4 or ch < 4:
return None
return cx0, cy0, cw, ch
def _triangle_to_voc_tuple(
    pts_xy: np.ndarray,
    img_w: int,
    img_h: int,
    class_name: str,
    bbox_pad_frac: float = 0.0,
) -> tuple | None:
    """
    Build (VOC tuple, half-open xyxy) for one triangle's vertices.

    The half-open box receives the same pad expansion as the VOC box so the YOLO line
    stays in sync. With bbox_pad_frac > 0 the tight AABB grows by max(w, h) * frac on
    each side; "truncated" is still judged from whether any vertex touches the border.
    Returns None when the box degenerates.
    """
    tight = _triangle_xyxy_exclusive(pts_xy, img_w, img_h)
    if tight is None:
        return None
    bx0, by0, bx1, by1 = tight
    if bbox_pad_frac > 1e-9:
        expanded = _expand_xyxy_half_open(
            bx0, by0, bx1, by1, img_w, img_h, bbox_pad_frac
        )
        if expanded is None:
            return None
        bx0, by0, bx1, by1 = expanded
    xs, ys = pts_xy[:, 0], pts_xy[:, 1]
    # Truncated flag: any vertex outside (or within ~1e-3 of) the image border.
    out_of_frame = (
        xs.min() < -1e-3
        or xs.max() >= img_w - 1e-3
        or ys.min() < -1e-3
        or ys.max() >= img_h - 1e-3
    )
    truncated = "1" if out_of_frame else "0"
    vx0, vy0, vx1, vy1 = _xyxy_exclusive_to_voc_inclusive(
        bx0, by0, bx1, by1, img_w, img_h
    )
    if vx1 < vx0 or vy1 < vy0:
        return None
    return (class_name, vx0, vy0, vx1, vy1, truncated), (bx0, by0, bx1, by1)
def _fg_bbox_from_alpha(fg_bgra):
"""非透明区域的外接矩形 (x,y,w,h)BGRA。"""
import numpy as np
if fg_bgra.shape[2] < 4:
h, w = fg_bgra.shape[:2]
return 0, 0, w, h
a = fg_bgra[:, :, 3]
ys, xs = np.where(a > 10)
if len(xs) == 0:
h, w = fg_bgra.shape[:2]
return 0, 0, w, h
x0, x1 = int(xs.min()), int(xs.max())
y0, y1 = int(ys.min()), int(ys.max())
return x0, y0, x1 - x0 + 1, y1 - y0 + 1
def _paste_fg_on_bg(bg_bgr, x, y, fg_scaled_bgra):
"""左上角 (x,y) 将 fg_scaled_bgraBGRA贴到 bg_bgr就地改 bg。"""
import numpy as np
fh, fw = fg_scaled_bgra.shape[:2]
bh, bw = bg_bgr.shape[:2]
x0, y0 = max(0, x), max(0, y)
x1, y1 = min(bw, x + fw), min(bh, y + fh)
if x0 >= x1 or y0 >= y1:
return
fx0, fy0 = x0 - x, y0 - y
fx1, fy1 = fx0 + (x1 - x0), fy0 + (y1 - y0)
roi_bg = bg_bgr[y0:y1, x0:x1]
roi_fg = fg_scaled_bgra[fy0:fy1, fx0:fx1]
a = roi_fg[:, :, 3:4].astype(np.float32) / 255.0
fg_rgb = roi_fg[:, :, :3].astype(np.float32)
bg_rgb = roi_bg.astype(np.float32)
blended = fg_rgb * a + bg_rgb * (1.0 - a)
roi_bg[:] = blended.astype(np.uint8)
def _perspective_warp_rgba(img_bgra, jitter_frac: float, rng: random.Random, np, cv2):
"""
对前景做轻微透视(四角微移),返回 (warped BGRA, M)。
M 为 3×3将透视前图像平面上的点映射到 warped 图像像素坐标;未应用透视时返回 (copy, None)。
jitter_frac扰动幅度约为 min(w,h) 的比例。
"""
h, w = img_bgra.shape[:2]
if jitter_frac <= 0 or min(w, h) < 16:
return img_bgra.copy(), None
j = float(max(1.5, min(w, h) * jitter_frac))
def dj():
return rng.uniform(-j, j)
pts_src = np.float32([[0, 0], [w, 0], [w, h], [0, h]])
pts_dst = np.float32(
[
[dj(), dj()],
[w + dj(), dj()],
[w + dj(), h + dj()],
[dj(), h + dj()],
]
)
xmin = float(pts_dst[:, 0].min())
ymin = float(pts_dst[:, 1].min())
pts_shift = pts_dst.copy()
pts_shift[:, 0] -= xmin
pts_shift[:, 1] -= ymin
out_w = max(4, int(np.ceil(float(pts_shift[:, 0].max()))) + 2)
out_h = max(4, int(np.ceil(float(pts_shift[:, 1].max()))) + 2)
M = cv2.getPerspectiveTransform(pts_src, pts_shift)
warped = cv2.warpPerspective(
img_bgra,
M,
(out_w, out_h),
flags=cv2.INTER_LINEAR,
borderMode=cv2.BORDER_CONSTANT,
borderValue=(0, 0, 0, 0),
)
return warped, M
def _color_jitter_bgr(comp_bgr, strength: float, rng: random.Random, np, cv2):
"""整图 HSV 抖动strength∈[0,1] 越大越强。"""
if strength <= 1e-6:
return comp_bgr
strength = min(1.0, max(0.0, strength))
hsv = cv2.cvtColor(comp_bgr, cv2.COLOR_BGR2HSV).astype(np.float32)
dh = rng.uniform(-18.0 * strength, 18.0 * strength)
hsv[:, :, 0] = (hsv[:, :, 0] + dh) % 180.0
sf = rng.uniform(1.0 - 0.22 * strength, 1.0 + 0.22 * strength)
vf = rng.uniform(1.0 - 0.22 * strength, 1.0 + 0.22 * strength)
hsv[:, :, 1] = np.clip(hsv[:, :, 1] * sf, 0, 255)
hsv[:, :, 2] = np.clip(hsv[:, :, 2] * vf, 0, 255)
# 轻微 BGR 通道偏置(模拟白平衡)
out = cv2.cvtColor(hsv.astype(np.uint8), cv2.COLOR_HSV2BGR).astype(np.float32)
bias = np.array(
[
rng.uniform(-12 * strength, 12 * strength),
rng.uniform(-12 * strength, 12 * strength),
rng.uniform(-12 * strength, 12 * strength),
],
dtype=np.float32,
)
out = np.clip(out + bias, 0, 255).astype(np.uint8)
return out
def _motion_blur_bgr(
    comp_bgr,
    rng: random.Random,
    k_min: int,
    k_max: int,
    np,
    cv2,
):
    """
    Linear motion blur along a random direction (filter2D). The kernel is an odd k x k
    containing a normalized line segment through the center at a uniform angle in
    [0, 180). Labels need no change: boxes stay at the object's true position, matching
    the usual "blurred image + sharp box" training convention for real cameras.
    """
    lo = int(max(3, k_min | 1))
    hi = int(max(lo, k_max | 1))
    k = rng.randint(lo, hi)
    if k % 2 == 0:  # randint may land on an even value between the odd bounds
        k = min(hi, k + 1)
    k = max(3, k)
    kernel_u8 = np.zeros((k, k), dtype=np.uint8)
    angle = rng.uniform(0.0, 180.0)
    theta = float(np.deg2rad(angle))
    mid = k // 2
    dx = float(np.cos(theta) * (k // 2))
    dy = float(np.sin(theta) * (k // 2))
    start = (int(round(mid - dx)), int(round(mid - dy)))
    end = (int(round(mid + dx)), int(round(mid + dy)))
    cv2.line(kernel_u8, start, end, 255, 1)
    total = float(kernel_u8.sum())
    if total < 1e-3:  # degenerate line: fall back to a single center tap
        kernel_u8[mid, mid] = 255
        total = 255.0
    return cv2.filter2D(comp_bgr, -1, kernel_u8.astype(np.float32) / total)
def _yolo_line(cls: int, xyxy_on_bg, img_w: int, img_h: int) -> str:
x0, y0, x1, y1 = xyxy_on_bg
bw, bh = x1 - x0, y1 - y0
cx = (x0 + x1) / 2.0 / img_w
cy = (y0 + y1) / 2.0 / img_h
nw = bw / img_w
nh = bh / img_h
cx = max(0.0, min(1.0, cx))
cy = max(0.0, min(1.0, cy))
nw = max(1e-6, min(1.0, nw))
nh = max(1e-6, min(1.0, nh))
return f"{cls} {cx:.6f} {cy:.6f} {nw:.6f} {nh:.6f}\n"
def _xyxy_exclusive_to_voc_inclusive(
x0: float, y0: float, x1: float, y1: float, img_w: int, img_h: int
) -> tuple[int, int, int, int]:
"""内部 xyxy 为半开区间 [x0,x1)×[y0,y1),转为 VOC inclusive 整数像素框。"""
iw, ih = max(1, img_w), max(1, img_h)
xi0 = max(0, min(iw - 1, int(x0)))
yi0 = max(0, min(ih - 1, int(y0)))
xi1 = max(xi0, min(iw - 1, int(x1) - 1))
yi1 = max(yi0, min(ih - 1, int(y1) - 1))
return xi0, yi0, xi1, yi1
def _write_pascal_voc_xml(
xml_path: str,
img_filename: str,
img_folder: str,
img_w: int,
img_h: int,
depth: int,
objects: list[tuple],
) -> None:
"""
objects 每项为 (class_name, xmin, ymin, xmax, ymax) 或
(class_name, xmin, ymin, xmax, ymax, truncated),坐标均为 inclusive 整数像素;
truncated 为 \"0\"\"1\"(省略时默认为 \"0\")。
"""
root = ET.Element("annotation")
ET.SubElement(root, "folder").text = img_folder
ET.SubElement(root, "filename").text = img_filename
src = ET.SubElement(root, "source")
ET.SubElement(src, "database").text = "synthetic_archery"
ET.SubElement(src, "annotation").text = "Pascal VOC compatible"
sz = ET.SubElement(root, "size")
ET.SubElement(sz, "width").text = str(img_w)
ET.SubElement(sz, "height").text = str(img_h)
ET.SubElement(sz, "depth").text = str(depth)
ET.SubElement(root, "segmented").text = "0"
for item in objects:
if len(item) == 6:
name, xmin, ymin, xmax, ymax, truncated = item
else:
name, xmin, ymin, xmax, ymax = item
truncated = "0"
obj = ET.SubElement(root, "object")
ET.SubElement(obj, "name").text = name
ET.SubElement(obj, "pose").text = "Unspecified"
ET.SubElement(obj, "truncated").text = str(truncated)
ET.SubElement(obj, "difficult").text = "0"
bb = ET.SubElement(obj, "bndbox")
ET.SubElement(bb, "xmin").text = str(xmin)
ET.SubElement(bb, "ymin").text = str(ymin)
ET.SubElement(bb, "xmax").text = str(xmax)
ET.SubElement(bb, "ymax").text = str(ymax)
tree = ET.ElementTree(root)
try:
ET.indent(tree, space=" ")
except AttributeError:
pass
tree.write(xml_path, encoding="utf-8", xml_declaration=True)
def _zip_images_xml(dataset_root: str, zip_path: str) -> None:
"""打包 dataset_root 下的 images/ 与 xml/ 到 zip根目录含这两个文件夹"""
img_dir = os.path.join(dataset_root, "images")
xml_dir = os.path.join(dataset_root, "xml")
if not os.path.isdir(img_dir) or not os.path.isdir(xml_dir):
raise FileNotFoundError(f"需要存在目录: {img_dir}{xml_dir}")
zip_path = os.path.abspath(zip_path)
os.makedirs(os.path.dirname(zip_path) or ".", exist_ok=True)
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
for folder, arc_prefix in ((img_dir, "images"), (xml_dir, "xml")):
for name in sorted(os.listdir(folder)):
fp = os.path.join(folder, name)
if os.path.isfile(fp):
zf.write(fp, arcname=os.path.join(arc_prefix, name).replace("\\", "/"))
def main():
    """CLI entry point: compose the target sprite onto random backgrounds and emit VOC/YOLO labels."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--bg-dir", required=True, help="背景图目录")
    ap.add_argument("--fg", required=True, help="靶子 PNG推荐 RGBA 抠图)或任意图")
    ap.add_argument("--out", default="./synth_dataset", help="输出根目录")
    ap.add_argument("--num", type=int, default=200, help="请求生成张数(实际不超过 --max-images")
    ap.add_argument(
        "--max-images",
        type=int,
        default=3000,
        help="最多生成图片张数超出部分忽略MaixCam 等平台常见上限 3000",
    )
    ap.add_argument(
        "--format",
        choices=("voc", "yolo", "both"),
        default="voc",
        help="voc=Pascal VOCimages+xmlyolo=labels txtboth=两者都写",
    )
    ap.add_argument(
        "--class-name",
        default="黑三角和圆环",
        help="VOC <object><name> 类别名(单类检测默认 target",
    )
    ap.add_argument("--class-id", type=int, default=0, help="YOLO 类别 id仅 --format yolo/both")
    ap.add_argument(
        "--zip",
        default=None,
        metavar="PATH",
        help="完成后将 images/ 与 xml/ 打成 zip仅 VOC/both 时有 xml路径如 ./dataset.zip",
    )
    ap.add_argument("--seed", type=int, default=None)
    ap.add_argument("--scale-min", type=float, default=0.15, help="靶子最短边占背景最短边比例下限")
    ap.add_argument("--scale-max", type=float, default=0.55, help="比例上限")
    ap.add_argument("--blur-max", type=float, default=0.0, help="高斯模糊 sigma 上限0 关闭")
    ap.add_argument(
        "--motion-prob",
        type=float,
        default=0.45,
        help="运动模糊概率 0~1默认约一半样本关模糊用 0",
    )
    ap.add_argument(
        "--motion-kernel-min",
        type=int,
        default=7,
        help="运动模糊 streak 长度下限(奇数,实际会纠到奇数)",
    )
    ap.add_argument(
        "--motion-kernel-max",
        type=int,
        default=35,
        help="运动模糊 streak 长度上限,越大越像长曝光/手抖",
    )
    ap.add_argument("--jpeg-quality", type=int, default=92)
    ap.add_argument(
        "--perspective",
        type=float,
        default=0.0,
        help="轻微透视:四角扰动约为 min(靶宽,靶高)×该系数0 关闭(建议 0.02~0.06",
    )
    ap.add_argument(
        "--perspective-prob",
        type=float,
        default=0.75,
        help="每张图应用透视的概率 0~1",
    )
    ap.add_argument(
        "--color-jitter",
        type=float,
        default=0.0,
        help="合成后整图颜色抖动强度 0~10 关闭(建议 0.4~0.8",
    )
    ap.add_argument(
        "--triangles-json",
        default=None,
        metavar="PATH",
        help="三角形顶点 JSONtest/archery_triangles_default.json坐标与 --fg 原图一致,"
        "多三角形时每张图写多个 VOC <object>(透视时顶点同步变换)",
    )
    ap.add_argument(
        "--triangle-bbox-pad-frac",
        type=float,
        default=0.0,
        help="三角形检测框在紧 AABB 四周再加 max(宽,高)×该比例VOC/YOLO 同步);"
        "0=贴顶点外接框Stage2 建议 0.08~0.18,与推理端 margin 接近更易对齐",
    )
    ap.add_argument(
        "--stage2-crop",
        action="store_true",
        help="合成与增强后按靶子外接框+随机边距裁剪,输出与 Stage2整靶 ROI构图一致标注为裁剪后坐标",
    )
    ap.add_argument(
        "--stage2-pad-min",
        type=float,
        default=0.02,
        help="Stage2 裁剪:四边 padding 相对靶 max(宽,高) 的比例下限",
    )
    ap.add_argument(
        "--stage2-pad-max",
        type=float,
        default=0.14,
        help="Stage2 裁剪padding 比例上限",
    )
    ap.add_argument(
        "--stage2-allow-partial",
        action="store_true",
        help="有 --triangles-json 时允许裁剪后有效三角形数少于 JSON默认要求数量一致",
    )
    args = ap.parse_args()
    # Heavy deps are imported lazily so argument parsing/--help works without them.
    try:
        import cv2
        import numpy as np
    except ImportError:
        print("[ERR] 需要 opencv-python、numpy")
        sys.exit(1)
    rng = random.Random(args.seed)
    bgs = _collect_images(args.bg_dir)
    if not bgs:
        print(f"[ERR] 背景目录无图片: {args.bg_dir}")
        sys.exit(1)
    fg_path = args.fg
    if not os.path.isfile(fg_path):
        print(f"[ERR] 找不到靶图: {fg_path}")
        sys.exit(1)
    fg = cv2.imread(fg_path, cv2.IMREAD_UNCHANGED)
    if fg is None:
        print(f"[ERR] 无法读取靶图: {fg_path}")
        sys.exit(1)
    # Normalize the foreground to BGRA: grayscale/BGR inputs get a fully opaque alpha.
    if fg.ndim == 2:
        fg = cv2.cvtColor(fg, cv2.COLOR_GRAY2BGRA)
    elif fg.shape[2] == 3:
        b, g, r = cv2.split(fg)
        a = np.full_like(b, 255)
        fg = cv2.merge([b, g, r, a])
    # Crop the sprite to its alpha bbox; triangle vertices are later shifted by (fx0, fy0).
    fx0, fy0, fw0, fh0 = _fg_bbox_from_alpha(fg)
    fg_crop = fg[fy0 : fy0 + fh0, fx0 : fx0 + fw0].copy()
    triangles_full = None
    if args.triangles_json:
        tpath = args.triangles_json
        if not os.path.isfile(tpath):
            print(f"[ERR] 找不到 --triangles-json: {tpath}")
            sys.exit(1)
        try:
            triangles_full = _load_triangles_json(tpath)
        except (json.JSONDecodeError, ValueError, OSError) as e:
            print(f"[ERR] 解析三角形 JSON 失败: {e}")
            sys.exit(1)
        print(f"[INFO] 已加载 {len(triangles_full)} 个三角形(每张图多个 VOC 检测框)")
    want_voc = args.format in ("voc", "both")
    want_yolo = args.format in ("yolo", "both")
    # Cap the requested count by --max-images (platform upload limits).
    n_gen = min(max(0, args.num), max(0, args.max_images))
    if args.num > args.max_images:
        print(f"[INFO] --num={args.num} 大于 --max-images={args.max_images},仅生成 {n_gen}")
    if args.stage2_crop:
        print(
            f"[INFO] Stage2 裁剪: pad∈[{args.stage2_pad_min},{args.stage2_pad_max}]×max(靶宽,靶高)"
            f"partial={'允许' if args.stage2_allow_partial else '不允许'}"
        )
    out_img = os.path.join(args.out, "images")
    out_xml = os.path.join(args.out, "xml")
    out_lbl = os.path.join(args.out, "labels")
    os.makedirs(out_img, exist_ok=True)
    if want_voc:
        os.makedirs(out_xml, exist_ok=True)
    if want_yolo:
        os.makedirs(out_lbl, exist_ok=True)
    print(f"[INFO] 背景 {len(bgs)} 张,格式={args.format},生成 {n_gen} 张 → {args.out}")
    i_done = 0
    # Rejection-sampling loop: any `continue` below retries with a fresh random draw
    # until n_gen images have actually been written.
    while i_done < n_gen:
        bg_path = rng.choice(bgs)
        bg = cv2.imread(bg_path, cv2.IMREAD_COLOR)
        if bg is None:
            continue
        bh, bw = bg.shape[:2]
        # Scale so the sprite's short side is a random fraction of the background's short side.
        short_bg = min(bh, bw)
        short_fg = min(fh0, fw0)
        smin = args.scale_min * short_bg / max(short_fg, 1)
        smax = args.scale_max * short_bg / max(short_fg, 1)
        scale = rng.uniform(max(smin, 0.05), max(smax, smin + 0.01))
        new_w = max(4, int(fw0 * scale))
        new_h = max(4, int(fh0 * scale))
        fg_s = cv2.resize(fg_crop, (new_w, new_h), interpolation=cv2.INTER_AREA)
        persp_M = None
        if args.perspective > 0 and rng.random() < args.perspective_prob:
            fg_s, persp_M = _perspective_warp_rgba(fg_s, args.perspective, rng, np, cv2)
        fw2, fh2 = fg_s.shape[1], fg_s.shape[0]
        # Recompute the alpha bbox after warping (the canvas may have grown).
        tx0, ty0, tw, th = _fg_bbox_from_alpha(fg_s)
        max_x = max(0, bw - fw2)
        max_y = max(0, bh - fh2)
        px = rng.randint(0, max_x) if max_x > 0 else 0
        py = rng.randint(0, max_y) if max_y > 0 else 0
        comp = bg.copy()
        _paste_fg_on_bg(comp, px, py, fg_s)
        # Label: whole-target alpha box (used when no triangles-json) or multiple triangle boxes
        bx0 = px + tx0
        by0 = py + ty0
        bx1 = px + tx0 + tw
        by1 = py + ty0 + th
        bx0 = max(0, min(bx0, bw - 1))
        by0 = max(0, min(by0, bh - 1))
        bx1 = max(bx0 + 1, min(bx1, bw))
        by1 = max(by0 + 1, min(by1, bh))
        # Project every configured triangle into composite-image coordinates.
        tri_pts_full: list[np.ndarray] = []
        if triangles_full is not None:
            for tri in triangles_full:
                pts_c = _warp_triangle_points(
                    tri,
                    float(fx0),
                    float(fy0),
                    float(fw0),
                    float(fh0),
                    new_w,
                    new_h,
                    persp_M,
                    px,
                    py,
                    np,
                    cv2,
                )
                tri_pts_full.append(pts_c)
        # Photometric augmentations: these do not move geometry, so labels stay valid.
        if args.color_jitter > 1e-6:
            comp = _color_jitter_bgr(comp, args.color_jitter, rng, np, cv2)
        if args.blur_max > 1e-6:
            sig = rng.uniform(0.3, args.blur_max)
            k = int(sig * 4) | 1
            comp = cv2.GaussianBlur(comp, (k, k), sig)
        if rng.random() < max(0.0, min(1.0, float(args.motion_prob))):
            comp = _motion_blur_bgr(
                comp,
                rng,
                args.motion_kernel_min,
                args.motion_kernel_max,
                np,
                cv2,
            )
        bh, bw = comp.shape[:2]
        if args.stage2_crop:
            # Stage2: crop around the target box with random margins; labels are
            # re-expressed in the cropped image's coordinate system.
            win = _stage2_crop_window(
                bx0,
                by0,
                bx1,
                by1,
                bw,
                bh,
                args.stage2_pad_min,
                args.stage2_pad_max,
                rng,
            )
            if win is None:
                continue
            cx0, cy0, cw, ch = win
            comp = comp[cy0 : cy0 + ch, cx0 : cx0 + cw].copy()
            out_w, out_h = cw, ch
            if triangles_full is not None:
                voc_objects = []
                yolo_lines_list = []
                for pts_c in tri_pts_full:
                    # Shift triangle vertices into crop coordinates before boxing.
                    p2 = pts_c.copy()
                    p2[:, 0] -= cx0
                    p2[:, 1] -= cy0
                    pair = _triangle_to_voc_tuple(
                        p2,
                        out_w,
                        out_h,
                        args.class_name,
                        args.triangle_bbox_pad_frac,
                    )
                    if pair is None:
                        continue
                    vo, xyxy = pair
                    voc_objects.append(vo)
                    if want_yolo:
                        yolo_lines_list.append(
                            _yolo_line(args.class_id, xyxy, out_w, out_h)
                        )
                # Unless --stage2-allow-partial, discard samples where cropping
                # lost any triangle and resample.
                if not args.stage2_allow_partial and len(voc_objects) != len(
                    triangles_full
                ):
                    continue
                if want_voc and not voc_objects:
                    continue
                if want_yolo and not yolo_lines_list:
                    continue
            else:
                # Single whole-target box, shifted into crop coordinates and re-clamped.
                nbx0, nby0 = bx0 - cx0, by0 - cy0
                nbx1, nby1 = bx1 - cx0, by1 - cy0
                nbx0 = max(0, min(nbx0, out_w - 1))
                nby0 = max(0, min(nby0, out_h - 1))
                nbx1 = max(nbx0 + 1, min(nbx1, out_w))
                nby1 = max(nby0 + 1, min(nby1, out_h))
                if nbx1 <= nbx0 or nby1 <= nby0:
                    continue
                vx0, vy0, vx1, vy1 = _xyxy_exclusive_to_voc_inclusive(
                    nbx0, nby0, nbx1, nby1, out_w, out_h
                )
                voc_objects = [(args.class_name, vx0, vy0, vx1, vy1)]
                yolo_lines_list = (
                    [_yolo_line(args.class_id, (nbx0, nby0, nbx1, nby1), out_w, out_h)]
                    if want_yolo
                    else []
                )
        else:
            # No Stage2 crop: labels are in full composite-image coordinates.
            out_w, out_h = bw, bh
            if triangles_full is not None:
                voc_objects = []
                yolo_lines_list = []
                for pts_c in tri_pts_full:
                    pair = _triangle_to_voc_tuple(
                        pts_c,
                        out_w,
                        out_h,
                        args.class_name,
                        args.triangle_bbox_pad_frac,
                    )
                    if pair is None:
                        continue
                    vo, xyxy = pair
                    voc_objects.append(vo)
                    if want_yolo:
                        yolo_lines_list.append(
                            _yolo_line(args.class_id, xyxy, out_w, out_h)
                        )
                if want_voc and not voc_objects:
                    continue
                if want_yolo and not yolo_lines_list:
                    continue
            else:
                vx0, vy0, vx1, vy1 = _xyxy_exclusive_to_voc_inclusive(
                    bx0, by0, bx1, by1, out_w, out_h
                )
                voc_objects = [(args.class_name, vx0, vy0, vx1, vy1)]
                yolo_lines_list = (
                    [_yolo_line(args.class_id, (bx0, by0, bx1, by1), out_w, out_h)]
                    if want_yolo
                    else []
                )
        # Persist the sample: image first, then whichever label formats were requested.
        stem = f"synth_{i_done:06d}"
        img_name = stem + ".jpg"
        img_path = os.path.join(out_img, img_name)
        cv2.imwrite(img_path, comp, [int(cv2.IMWRITE_JPEG_QUALITY), args.jpeg_quality])
        if want_voc:
            xml_path = os.path.join(out_xml, stem + ".xml")
            _write_pascal_voc_xml(
                xml_path,
                img_filename=img_name,
                img_folder="images",
                img_w=out_w,
                img_h=out_h,
                depth=3,
                objects=voc_objects,
            )
        if want_yolo:
            lbl_path = os.path.join(out_lbl, stem + ".txt")
            with open(lbl_path, "w", encoding="utf-8") as f:
                f.writelines(yolo_lines_list)
        i_done += 1
        if i_done % 50 == 0:
            print(f" ... {i_done}/{n_gen}")
    parts = [out_img]
    if want_voc:
        parts.append(out_xml)
    if want_yolo:
        parts.append(out_lbl)
    print(f"[OK] 完成: " + " , ".join(parts))
    if args.zip:
        if not want_voc:
            print("[WARN] --zip 需要 VOC 标注目录 xml/,当前格式未生成 xml跳过打包")
        else:
            try:
                _zip_images_xml(args.out, args.zip)
                print(f"[OK] 已打包: {os.path.abspath(args.zip)}")
            except OSError as e:
                print(f"[ERR] 打包失败: {e}")
                sys.exit(1)
if __name__ == "__main__":
main()