#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Synthesize training data: paste a "target" onto random backgrounds and emit
the annotations automatically (no manual labelling required).

Prerequisites (recommended):
  - Use a PNG with an alpha channel for the target (cut-out); the script
    derives a tight bbox from the non-transparent pixels.
  - If only a rectangular image without alpha is available, the whole image
    is pasted as a rectangle (slightly looser box).

Output (Pascal VOC by default, suitable for MaixCam-like platforms):
  - images/xxx.jpg
  - xml/xxx.xml (same stem as the image; one or several objects)
  - at most --max-images images are generated (default 3000)

Optional YOLO:
  - labels/xxx.txt (class cx cy w h, normalised to 0~1)

Multi-triangle detection (several objects per VOC file, e.g. for a YOLOv5
to VOC training flow):
  - Pass --triangles-json; vertices are expressed in the pixel coordinate
    system of the original --fg image.
  - The target is first cropped to its alpha bounding box; the crop offset is
    subtracted from the vertices automatically.
  - Vertices are transformed together with the perspective warp; every image
    gets one box per triangle.
  - Boxes are the axis-aligned bounding rectangle of the vertices by default;
    --triangle-bbox-pad-frac adds a proportional margin on all sides (to match
    the margin used at inference time).

Stage2 ROI (matches the second step of "detect whole target, then crop"):
  - --stage2-crop: after compositing and augmentation, crop around the target
    box with random padding; annotations move to crop coordinates.
  - With --triangles-json the number of triangles after cropping must equal
    the number in the JSON, otherwise the sample is discarded and resampled
    (relax with --stage2-allow-partial).

Motion blur (simulates hand shake / shutter; annotations stay geometric
ground truth):
  - --motion-prob: probability; --motion-kernel-min/max: streak length
    (odd kernel, larger means blurrier).
  - Can be combined with --blur-max Gaussian blur. Suggested for Stage2:
    --motion-prob 0.5~0.7 --motion-kernel-max 35 --blur-max 1.2

Dependencies: OpenCV + NumPy.

Examples:
  python test/synth_compose_yolo.py --bg-dir ./bg --fg ./target_cutout.png --out ./synth_out --num 3000
  python test/synth_compose_yolo.py ... --triangles-json test/archery_triangles_default.json --class-name triangle --stage2-crop
  python test/synth_compose_yolo.py ... --zip ./dataset_voc.zip
  python test/synth_compose_yolo.py ... --format yolo --out ./synth_yolo
"""
from __future__ import annotations

import argparse
import json
import os
import random
import sys
import zipfile
import xml.etree.ElementTree as ET

import numpy as np

# Abort generation after this many failed attempts in a row; without a cap the
# sampling loop in main() never terminates when no valid sample can be built.
_MAX_CONSECUTIVE_FAILURES = 1000


def _collect_images(folder: str, exts=(".jpg", ".jpeg", ".png", ".bmp")):
    """Return the sorted paths of regular files in *folder* ending in *exts*.

    The extension match is case-insensitive. Directories whose name happens to
    end in an image extension are skipped.
    """
    suffixes = tuple(e.lower() for e in exts)
    found = []
    for name in sorted(os.listdir(folder)):
        if not name.lower().endswith(suffixes):
            continue
        path = os.path.join(folder, name)
        if os.path.isfile(path):
            found.append(path)
    return found


def _load_triangles_json(path: str) -> list[list[tuple[float, float]]]:
    """Load triangle vertices from a JSON file.

    The file must be an object with a non-empty ``"triangles"`` array; every
    triangle is a list of three ``[x, y]`` vertices.

    Raises:
        ValueError: the structure is not as described above (this includes a
            non-object top level and non-numeric coordinates, so callers only
            need to handle ``ValueError`` for malformed content).
        OSError / json.JSONDecodeError: propagated from reading/parsing.
    """
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    tris = data.get("triangles") if isinstance(data, dict) else None
    if not isinstance(tris, list) or not tris:
        raise ValueError(f'JSON 需包含非空 "triangles" 数组: {path}')
    out: list[list[tuple[float, float]]] = []
    for t in tris:
        if not isinstance(t, list) or len(t) != 3:
            raise ValueError(f"每个三角形需 3 个顶点: {t!r}")
        pts = []
        for p in t:
            if not isinstance(p, (list, tuple)) or len(p) != 2:
                raise ValueError(f"顶点需为 [x,y]: {p!r}")
            try:
                pts.append((float(p[0]), float(p[1])))
            except (TypeError, ValueError) as err:
                # float(None) / float([..]) raise TypeError; normalise it.
                raise ValueError(f"顶点需为 [x,y]: {p!r}") from err
        out.append(pts)
    return out


def _warp_triangle_points(
    corners_fg_orig: list[tuple[float, float]],
    fx0: float,
    fy0: float,
    fw0: float,
    fh0: float,
    new_w: int,
    new_h: int,
    persp_M,
    px: int,
    py: int,
    np,
    cv2,
) -> np.ndarray:
    """Map triangle vertices from original-target pixels to the composite.

    Steps: subtract the alpha-crop offset (fx0, fy0), scale by the resize
    ratio (new_w/fw0, new_h/fh0), apply ``persp_M`` when it is not None, then
    add the paste offset (px, py). Returns a (3, 2) float32 array.
    """
    pts = np.array(corners_fg_orig, dtype=np.float32)
    pts[:, 0] -= fx0
    pts[:, 1] -= fy0
    pts[:, 0] *= new_w / max(fw0, 1e-6)
    pts[:, 1] *= new_h / max(fh0, 1e-6)
    if persp_M is not None:
        pts = cv2.perspectiveTransform(pts.reshape(1, -1, 2), persp_M).reshape(-1, 2)
    pts[:, 0] += px
    pts[:, 1] += py
    return pts


def _triangle_xyxy_exclusive(
    pts_xy: np.ndarray, img_w: int, img_h: int
) -> tuple[int, int, int, int] | None:
    """Half-open integer AABB of the vertices, intersected with the image.

    The raw box is floor(min)..ceil(max) and at least one pixel wide/high.
    Returns ``None`` when that box does not overlap ``[0,img_w)x[0,img_h)``,
    so triangles lying completely outside the image are rejected instead of
    being turned into a spurious 1-pixel box on the border.
    """
    xs = pts_xy[:, 0]
    ys = pts_xy[:, 1]
    x0 = int(np.floor(float(xs.min())))
    y0 = int(np.floor(float(ys.min())))
    # Degenerate (zero-extent) boxes are widened to one pixel.
    x1 = max(x0 + 1, int(np.ceil(float(xs.max()))))
    y1 = max(y0 + 1, int(np.ceil(float(ys.max()))))
    bx0, by0 = max(0, x0), max(0, y0)
    bx1, by1 = min(img_w, x1), min(img_h, y1)
    if bx1 <= bx0 or by1 <= by0:
        return None
    return bx0, by0, bx1, by1


def _expand_xyxy_half_open(
    bx0: int,
    by0: int,
    bx1: int,
    by1: int,
    img_w: int,
    img_h: int,
    pad_frac: float,
) -> tuple[int, int, int, int] | None:
    """Grow a half-open box symmetrically and clip it to the image.

    The margin on every side is ``max(width, height) * pad_frac``. With
    ``pad_frac`` effectively zero the box is returned unchanged.
    """
    if pad_frac <= 1e-9:
        return bx0, by0, bx1, by1
    bw = max(1, bx1 - bx0)
    bh = max(1, by1 - by0)
    margin = float(pad_frac) * float(max(bw, bh))
    x0 = int(np.floor(float(bx0) - margin))
    y0 = int(np.floor(float(by0) - margin))
    x1 = int(np.ceil(float(bx1) + margin))
    y1 = int(np.ceil(float(by1) + margin))
    iw, ih = max(1, img_w), max(1, img_h)
    x0 = max(0, min(x0, iw - 1))
    y0 = max(0, min(y0, ih - 1))
    x1 = max(x0 + 1, min(x1, iw))
    y1 = max(y0 + 1, min(y1, ih))
    if x1 <= x0 or y1 <= y0:
        return None
    return x0, y0, x1, y1


def _stage2_crop_window(
    tx0: int,
    ty0: int,
    tx1: int,
    ty1: int,
    img_w: int,
    img_h: int,
    pad_min_frac: float,
    pad_max_frac: float,
    rng: random.Random,
) -> tuple[int, int, int, int] | None:
    """Pick a crop window around the half-open target box.

    A single random padding (a fraction of ``max(width, height)`` drawn from
    ``[pad_min_frac, pad_max_frac]``) is added on all four sides and the
    result is clipped to the image. Returns ``(cx0, cy0, cw, ch)`` for
    ``comp[cy0:cy0+ch, cx0:cx0+cw]``, or ``None`` when the window is smaller
    than 4 pixels in either dimension.
    """
    iw, ih = max(1, img_w), max(1, img_h)
    tw = max(1, tx1 - tx0)
    th = max(1, ty1 - ty0)
    base = float(max(tw, th))
    p0 = max(0.0, float(pad_min_frac))
    p1 = max(p0, float(pad_max_frac))
    pad = rng.uniform(p0, p1) * base
    cx0 = int(np.floor(float(tx0) - pad))
    cy0 = int(np.floor(float(ty0) - pad))
    cx1 = int(np.ceil(float(tx1) + pad))
    cy1 = int(np.ceil(float(ty1) + pad))
    cx0 = max(0, min(cx0, iw - 1))
    cy0 = max(0, min(cy0, ih - 1))
    cx1 = max(cx0 + 1, min(cx1, iw))
    cy1 = max(cy0 + 1, min(cy1, ih))
    cw, ch = cx1 - cx0, cy1 - cy0
    if cw < 4 or ch < 4:
        return None
    return cx0, cy0, cw, ch


def _triangle_to_voc_tuple(
    pts_xy: np.ndarray,
    img_w: int,
    img_h: int,
    class_name: str,
    bbox_pad_frac: float = 0.0,
) -> tuple | None:
    """Build the VOC record and the half-open box for one triangle.

    Returns ``(voc, xyxy)`` where ``voc`` is
    ``(class_name, xmin, ymin, xmax, ymax, truncated)`` with inclusive
    coordinates and ``xyxy`` is the matching half-open box (used for the YOLO
    line). With ``bbox_pad_frac > 0`` both are expanded by
    ``max(width, height) * bbox_pad_frac``; ``truncated`` is still decided
    from the raw vertices touching or crossing the image border.
    Returns ``None`` when no valid box exists inside the image.
    """
    xyxy = _triangle_xyxy_exclusive(pts_xy, img_w, img_h)
    if xyxy is None:
        return None
    bx0, by0, bx1, by1 = xyxy
    if bbox_pad_frac > 1e-9:
        exp = _expand_xyxy_half_open(
            bx0, by0, bx1, by1, img_w, img_h, bbox_pad_frac
        )
        if exp is None:
            return None
        bx0, by0, bx1, by1 = exp
    xs = pts_xy[:, 0]
    ys = pts_xy[:, 1]
    touches_border = (
        xs.min() < -1e-3
        or xs.max() >= img_w - 1e-3
        or ys.min() < -1e-3
        or ys.max() >= img_h - 1e-3
    )
    truncated = "1" if touches_border else "0"
    vx0, vy0, vx1, vy1 = _xyxy_exclusive_to_voc_inclusive(
        bx0, by0, bx1, by1, img_w, img_h
    )
    if vx1 < vx0 or vy1 < vy0:
        return None
    voc = (class_name, vx0, vy0, vx1, vy1, truncated)
    return voc, (bx0, by0, bx1, by1)


def _fg_bbox_from_alpha(fg_bgra):
    """Return (x, y, w, h) of the pixels whose alpha exceeds 10.

    Falls back to the full image when there are fewer than 4 channels or no
    pixel passes the alpha threshold.
    """
    full_h, full_w = fg_bgra.shape[:2]
    if fg_bgra.shape[2] < 4:
        return 0, 0, full_w, full_h
    ys, xs = np.where(fg_bgra[:, :, 3] > 10)
    if len(xs) == 0:
        return 0, 0, full_w, full_h
    x0, x1 = int(xs.min()), int(xs.max())
    y0, y1 = int(ys.min()), int(ys.max())
    return x0, y0, x1 - x0 + 1, y1 - y0 + 1


def _paste_fg_on_bg(bg_bgr, x, y, fg_scaled_bgra):
    """Alpha-blend ``fg_scaled_bgra`` onto ``bg_bgr`` in place.

    (x, y) is the top-left corner of the foreground in background
    coordinates; the part falling outside the background is ignored.
    """
    fh, fw = fg_scaled_bgra.shape[:2]
    bh, bw = bg_bgr.shape[:2]
    x0, y0 = max(0, x), max(0, y)
    x1, y1 = min(bw, x + fw), min(bh, y + fh)
    if x0 >= x1 or y0 >= y1:
        return
    fx0, fy0 = x0 - x, y0 - y
    fx1, fy1 = fx0 + (x1 - x0), fy0 + (y1 - y0)
    roi_bg = bg_bgr[y0:y1, x0:x1]
    roi_fg = fg_scaled_bgra[fy0:fy1, fx0:fx1]
    alpha = roi_fg[:, :, 3:4].astype(np.float32) / 255.0
    fg_rgb = roi_fg[:, :, :3].astype(np.float32)
    bg_rgb = roi_bg.astype(np.float32)
    blended = fg_rgb * alpha + bg_rgb * (1.0 - alpha)
    roi_bg[:] = blended.astype(np.uint8)


def _perspective_warp_rgba(img_bgra, jitter_frac: float, rng: random.Random, np, cv2):
    """Apply a mild random perspective to the foreground.

    Each corner is displaced by up to ``max(1.5, min(w, h) * jitter_frac)``
    pixels. Returns ``(warped BGRA, M)`` where the 3x3 matrix ``M`` maps
    points of the input image to pixels of the warped image. When
    ``jitter_frac <= 0`` or the image is smaller than 16 px on its short
    side, returns ``(copy, None)``.
    """
    h, w = img_bgra.shape[:2]
    if jitter_frac <= 0 or min(w, h) < 16:
        return img_bgra.copy(), None
    j = float(max(1.5, min(w, h) * jitter_frac))

    def dj():
        return rng.uniform(-j, j)

    pts_src = np.float32([[0, 0], [w, 0], [w, h], [0, h]])
    pts_dst = np.float32(
        [
            [dj(), dj()],
            [w + dj(), dj()],
            [w + dj(), h + dj()],
            [dj(), h + dj()],
        ]
    )
    # Shift the destination quad so that it starts at the origin.
    xmin = float(pts_dst[:, 0].min())
    ymin = float(pts_dst[:, 1].min())
    pts_shift = pts_dst.copy()
    pts_shift[:, 0] -= xmin
    pts_shift[:, 1] -= ymin
    out_w = max(4, int(np.ceil(float(pts_shift[:, 0].max()))) + 2)
    out_h = max(4, int(np.ceil(float(pts_shift[:, 1].max()))) + 2)
    M = cv2.getPerspectiveTransform(pts_src, pts_shift)
    warped = cv2.warpPerspective(
        img_bgra,
        M,
        (out_w, out_h),
        flags=cv2.INTER_LINEAR,
        borderMode=cv2.BORDER_CONSTANT,
        borderValue=(0, 0, 0, 0),
    )
    return warped, M


def _color_jitter_bgr(comp_bgr, strength: float, rng: random.Random, np, cv2):
    """Whole-image HSV jitter plus a small per-channel BGR bias.

    ``strength`` is clamped to [0, 1]; larger means stronger. With a strength
    of (almost) zero the input is returned untouched.
    """
    if strength <= 1e-6:
        return comp_bgr
    strength = min(1.0, max(0.0, strength))
    hsv = cv2.cvtColor(comp_bgr, cv2.COLOR_BGR2HSV).astype(np.float32)
    dh = rng.uniform(-18.0 * strength, 18.0 * strength)
    hsv[:, :, 0] = (hsv[:, :, 0] + dh) % 180.0
    sf = rng.uniform(1.0 - 0.22 * strength, 1.0 + 0.22 * strength)
    vf = rng.uniform(1.0 - 0.22 * strength, 1.0 + 0.22 * strength)
    hsv[:, :, 1] = np.clip(hsv[:, :, 1] * sf, 0, 255)
    hsv[:, :, 2] = np.clip(hsv[:, :, 2] * vf, 0, 255)
    # Slight per-channel BGR bias (simulates white-balance shifts).
    out = cv2.cvtColor(hsv.astype(np.uint8), cv2.COLOR_HSV2BGR).astype(np.float32)
    bias = np.array(
        [
            rng.uniform(-12 * strength, 12 * strength),
            rng.uniform(-12 * strength, 12 * strength),
            rng.uniform(-12 * strength, 12 * strength),
        ],
        dtype=np.float32,
    )
    return np.clip(out + bias, 0, 255).astype(np.uint8)


def _motion_blur_bgr(
    comp_bgr,
    rng: random.Random,
    k_min: int,
    k_max: int,
    np,
    cv2,
):
    """Linear motion blur with a random direction (via ``cv2.filter2D``).

    The kernel is an odd k x k matrix holding a normalised line through the
    centre at an angle drawn uniformly from [0, 180) degrees. Annotations do
    not need to change: the box keeps describing the true object position.
    """
    lo = int(max(3, k_min | 1))
    hi = int(max(lo, k_max | 1))
    k = rng.randint(lo, hi)
    if k % 2 == 0:
        k = min(hi, k + 1)
    k = max(3, k)
    ker_u = np.zeros((k, k), dtype=np.uint8)
    ang = rng.uniform(0.0, 180.0)
    rad = float(np.deg2rad(ang))
    c = k // 2
    dx = float(np.cos(rad) * (k // 2))
    dy = float(np.sin(rad) * (k // 2))
    x0 = int(round(c - dx))
    y0 = int(round(c - dy))
    x1 = int(round(c + dx))
    y1 = int(round(c + dy))
    cv2.line(ker_u, (x0, y0), (x1, y1), 255, 1)
    s = float(ker_u.sum())
    if s < 1e-3:
        # Nothing was drawn: fall back to an identity kernel.
        ker_u[c, c] = 255
        s = 255.0
    ker = ker_u.astype(np.float32) / s
    return cv2.filter2D(comp_bgr, -1, ker)


def _yolo_line(cls: int, xyxy_on_bg, img_w: int, img_h: int) -> str:
    """Format one YOLO label line ``cls cx cy w h`` (normalised, clamped)."""
    x0, y0, x1, y1 = xyxy_on_bg
    cx = (x0 + x1) / 2.0 / img_w
    cy = (y0 + y1) / 2.0 / img_h
    nw = (x1 - x0) / img_w
    nh = (y1 - y0) / img_h
    cx = max(0.0, min(1.0, cx))
    cy = max(0.0, min(1.0, cy))
    nw = max(1e-6, min(1.0, nw))
    nh = max(1e-6, min(1.0, nh))
    return f"{cls} {cx:.6f} {cy:.6f} {nw:.6f} {nh:.6f}\n"


def _xyxy_exclusive_to_voc_inclusive(
    x0: float, y0: float, x1: float, y1: float, img_w: int, img_h: int
) -> tuple[int, int, int, int]:
    """Convert a half-open box [x0,x1)x[y0,y1) to inclusive VOC pixels."""
    iw, ih = max(1, img_w), max(1, img_h)
    xi0 = max(0, min(iw - 1, int(x0)))
    yi0 = max(0, min(ih - 1, int(y0)))
    xi1 = max(xi0, min(iw - 1, int(x1) - 1))
    yi1 = max(yi0, min(ih - 1, int(y1) - 1))
    return xi0, yi0, xi1, yi1


def _write_pascal_voc_xml(
    xml_path: str,
    img_filename: str,
    img_folder: str,
    img_w: int,
    img_h: int,
    depth: int,
    objects: list[tuple],
) -> None:
    """Write one Pascal VOC annotation file.

    Each item of ``objects`` is ``(class_name, xmin, ymin, xmax, ymax)`` or
    ``(class_name, xmin, ymin, xmax, ymax, truncated)`` with inclusive integer
    pixel coordinates; ``truncated`` is "0" or "1" and defaults to "0".
    """
    root = ET.Element("annotation")
    ET.SubElement(root, "folder").text = img_folder
    ET.SubElement(root, "filename").text = img_filename
    src = ET.SubElement(root, "source")
    ET.SubElement(src, "database").text = "synthetic_archery"
    ET.SubElement(src, "annotation").text = "Pascal VOC compatible"
    sz = ET.SubElement(root, "size")
    ET.SubElement(sz, "width").text = str(img_w)
    ET.SubElement(sz, "height").text = str(img_h)
    ET.SubElement(sz, "depth").text = str(depth)
    ET.SubElement(root, "segmented").text = "0"
    for item in objects:
        if len(item) == 6:
            name, xmin, ymin, xmax, ymax, truncated = item
        else:
            name, xmin, ymin, xmax, ymax = item
            truncated = "0"
        obj = ET.SubElement(root, "object")
        ET.SubElement(obj, "name").text = name
        ET.SubElement(obj, "pose").text = "Unspecified"
        ET.SubElement(obj, "truncated").text = str(truncated)
        ET.SubElement(obj, "difficult").text = "0"
        bb = ET.SubElement(obj, "bndbox")
        ET.SubElement(bb, "xmin").text = str(xmin)
        ET.SubElement(bb, "ymin").text = str(ymin)
        ET.SubElement(bb, "xmax").text = str(xmax)
        ET.SubElement(bb, "ymax").text = str(ymax)
    tree = ET.ElementTree(root)
    try:
        ET.indent(tree, space=" ")
    except AttributeError:
        # ET.indent is not available on older Python versions; the file is
        # then written without pretty-printing.
        pass
    tree.write(xml_path, encoding="utf-8", xml_declaration=True)


def _zip_images_xml(dataset_root: str, zip_path: str) -> None:
    """Pack ``images/`` and ``xml/`` under ``dataset_root`` into a zip.

    Both folders appear at the archive root.

    Raises:
        FileNotFoundError: one of the two directories does not exist.
    """
    img_dir = os.path.join(dataset_root, "images")
    xml_dir = os.path.join(dataset_root, "xml")
    if not os.path.isdir(img_dir) or not os.path.isdir(xml_dir):
        raise FileNotFoundError(f"需要存在目录: {img_dir} 与 {xml_dir}")
    zip_path = os.path.abspath(zip_path)
    os.makedirs(os.path.dirname(zip_path) or ".", exist_ok=True)
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        for folder, arc_prefix in ((img_dir, "images"), (xml_dir, "xml")):
            for name in sorted(os.listdir(folder)):
                fp = os.path.join(folder, name)
                if os.path.isfile(fp):
                    zf.write(fp, arcname=os.path.join(arc_prefix, name).replace("\\", "/"))


def _triangle_labels(
    tri_pts,
    off_x: int,
    off_y: int,
    out_w: int,
    out_h: int,
    class_name: str,
    class_id: int,
    pad_frac: float,
    want_yolo: bool,
):
    """Build VOC records and YOLO lines for all triangles of one image.

    Vertices are shifted by (-off_x, -off_y) first (the crop origin; zero when
    no crop was applied). Triangles without a valid box are skipped. Returns
    ``(voc_objects, yolo_lines)``; ``yolo_lines`` stays empty unless
    ``want_yolo`` is true.
    """
    voc_objects = []
    yolo_lines = []
    for pts in tri_pts:
        shifted = pts.copy()
        shifted[:, 0] -= off_x
        shifted[:, 1] -= off_y
        pair = _triangle_to_voc_tuple(shifted, out_w, out_h, class_name, pad_frac)
        if pair is None:
            continue
        voc, xyxy = pair
        voc_objects.append(voc)
        if want_yolo:
            yolo_lines.append(_yolo_line(class_id, xyxy, out_w, out_h))
    return voc_objects, yolo_lines


def main():
    """Command-line entry point: parse options and generate the dataset."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--bg-dir", required=True, help="背景图目录")
    ap.add_argument("--fg", required=True, help="靶子 PNG(推荐 RGBA 抠图)或任意图")
    ap.add_argument("--out", default="./synth_dataset", help="输出根目录")
    ap.add_argument("--num", type=int, default=200, help="请求生成张数(实际不超过 --max-images)")
    ap.add_argument(
        "--max-images",
        type=int,
        default=3000,
        help="最多生成图片张数,超出部分忽略(MaixCam 等平台常见上限 3000)",
    )
    ap.add_argument(
        "--format",
        choices=("voc", "yolo", "both"),
        default="voc",
        help="voc=Pascal VOC(images+xml);yolo=labels txt;both=两者都写",
    )
    ap.add_argument(
        "--class-name",
        default="黑三角和圆环",
        # The help used to claim the default was "target"; show the real one.
        help="VOC 类别名(默认 %(default)s)",
    )
    ap.add_argument("--class-id", type=int, default=0, help="YOLO 类别 id(仅 --format yolo/both)")
    ap.add_argument(
        "--zip",
        default=None,
        metavar="PATH",
        help="完成后将 images/ 与 xml/ 打成 zip(仅 VOC/both 时有 xml;路径如 ./dataset.zip)",
    )
    ap.add_argument("--seed", type=int, default=None)
    ap.add_argument("--scale-min", type=float, default=0.15, help="靶子最短边占背景最短边比例下限")
    ap.add_argument("--scale-max", type=float, default=0.55, help="比例上限")
    ap.add_argument("--blur-max", type=float, default=0.0, help="高斯模糊 sigma 上限,0 关闭")
    ap.add_argument(
        "--motion-prob",
        type=float,
        default=0.45,
        help="运动模糊概率 0~1(默认约一半样本;关模糊用 0)",
    )
    ap.add_argument(
        "--motion-kernel-min",
        type=int,
        default=7,
        help="运动模糊 streak 长度下限(奇数,实际会纠到奇数)",
    )
    ap.add_argument(
        "--motion-kernel-max",
        type=int,
        default=35,
        help="运动模糊 streak 长度上限,越大越像长曝光/手抖",
    )
    ap.add_argument("--jpeg-quality", type=int, default=92)
    ap.add_argument(
        "--perspective",
        type=float,
        default=0.0,
        help="轻微透视:四角扰动约为 min(靶宽,靶高)×该系数,0 关闭(建议 0.02~0.06)",
    )
    ap.add_argument(
        "--perspective-prob",
        type=float,
        default=0.75,
        help="每张图应用透视的概率 0~1",
    )
    ap.add_argument(
        "--color-jitter",
        type=float,
        default=0.0,
        help="合成后整图颜色抖动强度 0~1,0 关闭(建议 0.4~0.8)",
    )
    ap.add_argument(
        "--triangles-json",
        default=None,
        metavar="PATH",
        help="三角形顶点 JSON(test/archery_triangles_default.json);坐标与 --fg 原图一致,"
        "多三角形时每张图写多个 VOC (透视时顶点同步变换)",
    )
    ap.add_argument(
        "--triangle-bbox-pad-frac",
        type=float,
        default=0.0,
        help="三角形检测框在紧 AABB 四周再加 max(宽,高)×该比例(VOC/YOLO 同步);"
        "0=贴顶点外接框;Stage2 建议 0.08~0.18,与推理端 margin 接近更易对齐",
    )
    ap.add_argument(
        "--stage2-crop",
        action="store_true",
        help="合成与增强后按靶子外接框+随机边距裁剪,输出与 Stage2(整靶 ROI)构图一致;标注为裁剪后坐标",
    )
    ap.add_argument(
        "--stage2-pad-min",
        type=float,
        default=0.02,
        help="Stage2 裁剪:四边 padding 相对靶 max(宽,高) 的比例下限",
    )
    ap.add_argument(
        "--stage2-pad-max",
        type=float,
        default=0.14,
        help="Stage2 裁剪:padding 比例上限",
    )
    ap.add_argument(
        "--stage2-allow-partial",
        action="store_true",
        help="有 --triangles-json 时允许裁剪后有效三角形数少于 JSON(默认要求数量一致)",
    )
    args = ap.parse_args()

    try:
        import cv2
        import numpy as np
    except ImportError:
        print("[ERR] 需要 opencv-python、numpy")
        sys.exit(1)

    rng = random.Random(args.seed)

    # A missing directory would otherwise surface as a raw os.listdir traceback.
    if not os.path.isdir(args.bg_dir):
        print(f"[ERR] 背景目录不存在: {args.bg_dir}")
        sys.exit(1)
    bgs = _collect_images(args.bg_dir)
    if not bgs:
        print(f"[ERR] 背景目录无图片: {args.bg_dir}")
        sys.exit(1)

    fg_path = args.fg
    if not os.path.isfile(fg_path):
        print(f"[ERR] 找不到靶图: {fg_path}")
        sys.exit(1)
    fg = cv2.imread(fg_path, cv2.IMREAD_UNCHANGED)
    if fg is None:
        print(f"[ERR] 无法读取靶图: {fg_path}")
        sys.exit(1)
    # Normalise the foreground to 4-channel BGRA.
    if fg.ndim == 2:
        fg = cv2.cvtColor(fg, cv2.COLOR_GRAY2BGRA)
    elif fg.shape[2] == 3:
        b, g, r = cv2.split(fg)
        a = np.full_like(b, 255)
        fg = cv2.merge([b, g, r, a])

    fx0, fy0, fw0, fh0 = _fg_bbox_from_alpha(fg)
    fg_crop = fg[fy0 : fy0 + fh0, fx0 : fx0 + fw0].copy()

    triangles_full = None
    if args.triangles_json:
        tpath = args.triangles_json
        if not os.path.isfile(tpath):
            print(f"[ERR] 找不到 --triangles-json: {tpath}")
            sys.exit(1)
        try:
            triangles_full = _load_triangles_json(tpath)
        except (json.JSONDecodeError, ValueError, OSError) as e:
            print(f"[ERR] 解析三角形 JSON 失败: {e}")
            sys.exit(1)
        print(f"[INFO] 已加载 {len(triangles_full)} 个三角形(每张图多个 VOC 检测框)")

    want_voc = args.format in ("voc", "both")
    want_yolo = args.format in ("yolo", "both")

    n_gen = min(max(0, args.num), max(0, args.max_images))
    if args.num > args.max_images:
        print(f"[INFO] --num={args.num} 大于 --max-images={args.max_images},仅生成 {n_gen} 张")

    if args.stage2_crop:
        print(
            f"[INFO] Stage2 裁剪: pad∈[{args.stage2_pad_min},{args.stage2_pad_max}]×max(靶宽,靶高),"
            f"partial={'允许' if args.stage2_allow_partial else '不允许'}"
        )

    out_img = os.path.join(args.out, "images")
    out_xml = os.path.join(args.out, "xml")
    out_lbl = os.path.join(args.out, "labels")
    os.makedirs(out_img, exist_ok=True)
    if want_voc:
        os.makedirs(out_xml, exist_ok=True)
    if want_yolo:
        os.makedirs(out_lbl, exist_ok=True)

    print(f"[INFO] 背景 {len(bgs)} 张,格式={args.format},生成 {n_gen} 张 → {args.out}")

    i_done = 0
    failures = 0  # consecutive rejected attempts; reset after each success
    while i_done < n_gen:
        if failures >= _MAX_CONSECUTIVE_FAILURES:
            print(
                f"[ERR] 连续 {failures} 次采样失败,已生成 {i_done}/{n_gen} 张;"
                "请检查背景图、--triangles-json 坐标或 Stage2 参数"
            )
            sys.exit(1)
        # Count the attempt as failed up front; every `continue` below is a
        # rejection, and the counter is cleared once a sample is written.
        failures += 1

        bg_path = rng.choice(bgs)
        bg = cv2.imread(bg_path, cv2.IMREAD_COLOR)
        if bg is None:
            continue
        bh, bw = bg.shape[:2]

        short_bg = min(bh, bw)
        short_fg = min(fh0, fw0)
        smin = args.scale_min * short_bg / max(short_fg, 1)
        smax = args.scale_max * short_bg / max(short_fg, 1)
        scale = rng.uniform(max(smin, 0.05), max(smax, smin + 0.01))

        new_w = max(4, int(fw0 * scale))
        new_h = max(4, int(fh0 * scale))
        fg_s = cv2.resize(fg_crop, (new_w, new_h), interpolation=cv2.INTER_AREA)

        persp_M = None
        if args.perspective > 0 and rng.random() < args.perspective_prob:
            fg_s, persp_M = _perspective_warp_rgba(fg_s, args.perspective, rng, np, cv2)

        fw2, fh2 = fg_s.shape[1], fg_s.shape[0]
        tx0, ty0, tw, th = _fg_bbox_from_alpha(fg_s)

        max_x = max(0, bw - fw2)
        max_y = max(0, bh - fh2)
        px = rng.randint(0, max_x) if max_x > 0 else 0
        py = rng.randint(0, max_y) if max_y > 0 else 0

        comp = bg.copy()
        _paste_fg_on_bg(comp, px, py, fg_s)

        # Whole-target alpha box (used when no triangles JSON is given, and
        # as the centre of the Stage2 crop), clipped to the background.
        bx0 = px + tx0
        by0 = py + ty0
        bx1 = px + tx0 + tw
        by1 = py + ty0 + th
        bx0 = max(0, min(bx0, bw - 1))
        by0 = max(0, min(by0, bh - 1))
        bx1 = max(bx0 + 1, min(bx1, bw))
        by1 = max(by0 + 1, min(by1, bh))

        tri_pts_full: list[np.ndarray] = []
        if triangles_full is not None:
            for tri in triangles_full:
                pts_c = _warp_triangle_points(
                    tri,
                    float(fx0),
                    float(fy0),
                    float(fw0),
                    float(fh0),
                    new_w,
                    new_h,
                    persp_M,
                    px,
                    py,
                    np,
                    cv2,
                )
                tri_pts_full.append(pts_c)

        if args.color_jitter > 1e-6:
            comp = _color_jitter_bgr(comp, args.color_jitter, rng, np, cv2)

        if args.blur_max > 1e-6:
            sig = rng.uniform(0.3, args.blur_max)
            k = int(sig * 4) | 1
            comp = cv2.GaussianBlur(comp, (k, k), sig)

        if rng.random() < max(0.0, min(1.0, float(args.motion_prob))):
            comp = _motion_blur_bgr(
                comp,
                rng,
                args.motion_kernel_min,
                args.motion_kernel_max,
                np,
                cv2,
            )

        bh, bw = comp.shape[:2]
        if args.stage2_crop:
            win = _stage2_crop_window(
                bx0,
                by0,
                bx1,
                by1,
                bw,
                bh,
                args.stage2_pad_min,
                args.stage2_pad_max,
                rng,
            )
            if win is None:
                continue
            cx0, cy0, out_w, out_h = win
            comp = comp[cy0 : cy0 + out_h, cx0 : cx0 + out_w].copy()
        else:
            cx0, cy0, out_w, out_h = 0, 0, bw, bh

        if triangles_full is not None:
            voc_objects, yolo_lines_list = _triangle_labels(
                tri_pts_full,
                cx0,
                cy0,
                out_w,
                out_h,
                args.class_name,
                args.class_id,
                args.triangle_bbox_pad_frac,
                want_yolo,
            )
            # Stage2 strict mode: every triangle of the JSON must survive.
            if (
                args.stage2_crop
                and not args.stage2_allow_partial
                and len(voc_objects) != len(triangles_full)
            ):
                continue
            if want_voc and not voc_objects:
                continue
            if want_yolo and not yolo_lines_list:
                continue
        else:
            # Whole-target box moved into output (crop) coordinates.
            nbx0, nby0 = bx0 - cx0, by0 - cy0
            nbx1, nby1 = bx1 - cx0, by1 - cy0
            nbx0 = max(0, min(nbx0, out_w - 1))
            nby0 = max(0, min(nby0, out_h - 1))
            nbx1 = max(nbx0 + 1, min(nbx1, out_w))
            nby1 = max(nby0 + 1, min(nby1, out_h))
            if nbx1 <= nbx0 or nby1 <= nby0:
                continue
            vx0, vy0, vx1, vy1 = _xyxy_exclusive_to_voc_inclusive(
                nbx0, nby0, nbx1, nby1, out_w, out_h
            )
            voc_objects = [(args.class_name, vx0, vy0, vx1, vy1)]
            yolo_lines_list = (
                [_yolo_line(args.class_id, (nbx0, nby0, nbx1, nby1), out_w, out_h)]
                if want_yolo
                else []
            )

        stem = f"synth_{i_done:06d}"
        img_name = stem + ".jpg"
        img_path = os.path.join(out_img, img_name)
        # cv2.imwrite reports failure through its return value; stop rather
        # than emit annotations for an image that does not exist.
        if not cv2.imwrite(img_path, comp, [int(cv2.IMWRITE_JPEG_QUALITY), args.jpeg_quality]):
            print(f"[ERR] 写入图片失败: {img_path}")
            sys.exit(1)

        if want_voc:
            xml_path = os.path.join(out_xml, stem + ".xml")
            _write_pascal_voc_xml(
                xml_path,
                img_filename=img_name,
                img_folder="images",
                img_w=out_w,
                img_h=out_h,
                depth=3,
                objects=voc_objects,
            )
        if want_yolo:
            lbl_path = os.path.join(out_lbl, stem + ".txt")
            with open(lbl_path, "w", encoding="utf-8") as f:
                f.writelines(yolo_lines_list)

        i_done += 1
        failures = 0
        if i_done % 50 == 0:
            print(f" ... {i_done}/{n_gen}")

    parts = [out_img]
    if want_voc:
        parts.append(out_xml)
    if want_yolo:
        parts.append(out_lbl)
    print("[OK] 完成: " + " , ".join(parts))

    if args.zip:
        if not want_voc:
            print("[WARN] --zip 需要 VOC 标注目录 xml/,当前格式未生成 xml,跳过打包")
        else:
            try:
                _zip_images_xml(args.out, args.zip)
                print(f"[OK] 已打包: {os.path.abspath(args.zip)}")
            except OSError as e:
                print(f"[ERR] 打包失败: {e}")
                sys.exit(1)


if __name__ == "__main__":
    main()