Files
archery/test/synth_compose_yolo.py
2026-05-11 16:26:05 +08:00

876 lines
30 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
合成训练数据:把「靶子」贴到随机背景上,并自动生成标注(无需手工标注)。
前置条件(推荐):
- 靶子用带透明通道的 PNG抠图后脚本按非透明像素算紧贴 bbox
- 若只有矩形靶图无 alpha可用整张图作为矩形框贴入略松
输出(默认 Pascal VOC适配 MaixCam 等平台):
- images/xxx.jpg
- xml/xxx.xml与图片同名单目标或多目标时可扩展
- 生成张数不超过 --max-images默认 3000
可选 YOLO
- labels/xxx.txtclass cx cy w h相对 0~1
多三角形检测Pascal VOC 多 <object>,适配 YOLOv5 转 VOC 训练):
- 提供 --triangles-json顶点在与 --fg 一致的原始靶图像素坐标系下;
- 脚本先按 alpha 外接框裁切靶图,顶点会自动减去裁切偏移;
- 透视变换时同步变换顶点,每张图输出多个三角形框;
- 默认标注为顶点轴对齐最小外接矩形;可选 --triangle-bbox-pad-frac 四周加比例余量(与推理 margin 对齐)。
Stage2 ROI对齐「先检整靶再裁小图」的第二步输入
- --stage2-crop在合成+增强后,按靶子外接框四周随机 padding 裁剪,标注改到裁剪图坐标系;
- 有 --triangles-json 时默认要求裁剪后三角形数与 JSON 一致,否则丢弃重采样(可用 --stage2-allow-partial
运动模糊(模拟手持/快门,默认约一半样本会施加;标注仍为几何真值,与真机域更接近):
- --motion-prob施加概率--motion-kernel-min/max模糊 streak 长度(奇数核,越大越糊)。
- 可与 --blur-max 高斯模糊叠加Stage2 建议:--motion-prob 0.5~0.7 --motion-kernel-max 35 --blur-max 1.2
依赖OpenCV + NumPyPC 上跑即可Maix 上若内存够也可试)。
示例:
python test/synth_compose_yolo.py --bg-dir ./bg --fg ./target_cutout.png --out ./synth_out --num 3000
python test/synth_compose_yolo.py ... --triangles-json test/archery_triangles_default.json --class-name triangle --stage2-crop
python test/synth_compose_yolo.py ... --zip ./dataset_voc.zip
python test/synth_compose_yolo.py ... --format yolo --out ./synth_yolo
"""
from __future__ import annotations
import argparse
import json
import os
import random
import sys
import zipfile
import xml.etree.ElementTree as ET
import numpy as np
def _collect_images(folder: str, exts=(".jpg", ".jpeg", ".png", ".bmp")):
out = []
for name in sorted(os.listdir(folder)):
low = name.lower()
if low.endswith(exts):
out.append(os.path.join(folder, name))
return out
def _load_triangles_json(path: str) -> list[list[tuple[float, float]]]:
with open(path, encoding="utf-8") as f:
data = json.load(f)
tris = data.get("triangles")
if not isinstance(tris, list) or not tris:
raise ValueError(f'JSON 需包含非空 "triangles" 数组: {path}')
out: list[list[tuple[float, float]]] = []
for t in tris:
if not isinstance(t, list) or len(t) != 3:
raise ValueError(f"每个三角形需 3 个顶点: {t!r}")
pts = []
for p in t:
if not isinstance(p, (list, tuple)) or len(p) != 2:
raise ValueError(f"顶点需为 [x,y]: {p!r}")
pts.append((float(p[0]), float(p[1])))
out.append(pts)
return out
def _warp_triangle_points(
corners_fg_orig: list[tuple[float, float]],
fx0: float,
fy0: float,
fw0: float,
fh0: float,
new_w: int,
new_h: int,
persp_M,
px: int,
py: int,
np,
cv2,
) -> np.ndarray:
"""原始靶图像素坐标下的三角形顶点 -> 合成图上的 (3,2) float32。"""
pts = np.array(corners_fg_orig, dtype=np.float32)
pts[:, 0] -= fx0
pts[:, 1] -= fy0
pts[:, 0] *= new_w / max(fw0, 1e-6)
pts[:, 1] *= new_h / max(fh0, 1e-6)
if persp_M is not None:
pts = cv2.perspectiveTransform(pts.reshape(1, -1, 2), persp_M).reshape(-1, 2)
pts[:, 0] += px
pts[:, 1] += py
return pts
def _triangle_xyxy_exclusive(
pts_xy: np.ndarray, img_w: int, img_h: int
) -> tuple[int, int, int, int] | None:
xs = pts_xy[:, 0]
ys = pts_xy[:, 1]
bx0 = max(0, min(img_w - 1, int(np.floor(float(xs.min())))))
by0 = max(0, min(img_h - 1, int(np.floor(float(ys.min())))))
bx1 = max(bx0 + 1, min(img_w, int(np.ceil(float(xs.max())))))
by1 = max(by0 + 1, min(img_h, int(np.ceil(float(ys.max())))))
if bx1 <= bx0 or by1 <= by0:
return None
return bx0, by0, bx1, by1
def _expand_xyxy_half_open(
bx0: int,
by0: int,
bx1: int,
by1: int,
img_w: int,
img_h: int,
pad_frac: float,
) -> tuple[int, int, int, int] | None:
"""在半开框 [bx0,bx1)×[by0,by1) 四周按 max(宽,高)×pad_frac 对称扩展,并裁入图像。"""
if pad_frac <= 1e-9:
return bx0, by0, bx1, by1
bw = max(1, bx1 - bx0)
bh = max(1, by1 - by0)
base = float(max(bw, bh))
p = float(pad_frac) * base
x0 = int(np.floor(float(bx0) - p))
y0 = int(np.floor(float(by0) - p))
x1 = int(np.ceil(float(bx1) + p))
y1 = int(np.ceil(float(by1) + p))
iw, ih = max(1, img_w), max(1, img_h)
x0 = max(0, min(x0, iw - 1))
y0 = max(0, min(y0, ih - 1))
x1 = max(x0 + 1, min(x1, iw))
y1 = max(y0 + 1, min(y1, ih))
if x1 <= x0 or y1 <= y0:
return None
return x0, y0, x1, y1
def _stage2_crop_window(
tx0: int,
ty0: int,
tx1: int,
ty1: int,
img_w: int,
img_h: int,
pad_min_frac: float,
pad_max_frac: float,
rng: random.Random,
) -> tuple[int, int, int, int] | None:
"""
以靶子轴对齐框 [tx0,tx1)×[ty0,ty1)(半开)为中心,四周加随机 padding相对 max(宽,高) 的比例),
再限制在图像内。返回 (cx0, cy0, cw, ch) 用于 comp[cy0:cy0+ch, cx0:cx0+cw]。
"""
iw, ih = max(1, img_w), max(1, img_h)
tw = max(1, tx1 - tx0)
th = max(1, ty1 - ty0)
base = float(max(tw, th))
p0 = max(0.0, float(pad_min_frac))
p1 = max(p0, float(pad_max_frac))
pad = rng.uniform(p0, p1) * base
cx0 = int(np.floor(float(tx0) - pad))
cy0 = int(np.floor(float(ty0) - pad))
cx1 = int(np.ceil(float(tx1) + pad))
cy1 = int(np.ceil(float(ty1) + pad))
cx0 = max(0, min(cx0, iw - 1))
cy0 = max(0, min(cy0, ih - 1))
cx1 = max(cx0 + 1, min(cx1, iw))
cy1 = max(cy0 + 1, min(cy1, ih))
cw, ch = cx1 - cx0, cy1 - cy0
if cw < 4 or ch < 4:
return None
return cx0, cy0, cw, ch
def _triangle_to_voc_tuple(
    pts_xy: np.ndarray,
    img_w: int,
    img_h: int,
    class_name: str,
    bbox_pad_frac: float = 0.0,
) -> tuple | None:
    """
    Build (VOC tuple, half-open xyxy) for one triangle's vertices.

    The half-open box receives the same pad expansion as the VOC box so the YOLO line
    stays in sync. With bbox_pad_frac > 0 the tight AABB grows by max(w, h) * frac on
    each side; "truncated" is still judged from whether any vertex touches the border.
    Returns None when the box degenerates.
    """
    tight = _triangle_xyxy_exclusive(pts_xy, img_w, img_h)
    if tight is None:
        return None
    bx0, by0, bx1, by1 = tight
    if bbox_pad_frac > 1e-9:
        expanded = _expand_xyxy_half_open(
            bx0, by0, bx1, by1, img_w, img_h, bbox_pad_frac
        )
        if expanded is None:
            return None
        bx0, by0, bx1, by1 = expanded
    xs, ys = pts_xy[:, 0], pts_xy[:, 1]
    # Truncated flag: any vertex outside (or within ~1e-3 of) the image border.
    out_of_frame = (
        xs.min() < -1e-3
        or xs.max() >= img_w - 1e-3
        or ys.min() < -1e-3
        or ys.max() >= img_h - 1e-3
    )
    truncated = "1" if out_of_frame else "0"
    vx0, vy0, vx1, vy1 = _xyxy_exclusive_to_voc_inclusive(
        bx0, by0, bx1, by1, img_w, img_h
    )
    if vx1 < vx0 or vy1 < vy0:
        return None
    return (class_name, vx0, vy0, vx1, vy1, truncated), (bx0, by0, bx1, by1)
def _fg_bbox_from_alpha(fg_bgra):
"""非透明区域的外接矩形 (x,y,w,h)BGRA。"""
import numpy as np
if fg_bgra.shape[2] < 4:
h, w = fg_bgra.shape[:2]
return 0, 0, w, h
a = fg_bgra[:, :, 3]
ys, xs = np.where(a > 10)
if len(xs) == 0:
h, w = fg_bgra.shape[:2]
return 0, 0, w, h
x0, x1 = int(xs.min()), int(xs.max())
y0, y1 = int(ys.min()), int(ys.max())
return x0, y0, x1 - x0 + 1, y1 - y0 + 1
def _paste_fg_on_bg(bg_bgr, x, y, fg_scaled_bgra):
"""左上角 (x,y) 将 fg_scaled_bgraBGRA贴到 bg_bgr就地改 bg。"""
import numpy as np
fh, fw = fg_scaled_bgra.shape[:2]
bh, bw = bg_bgr.shape[:2]
x0, y0 = max(0, x), max(0, y)
x1, y1 = min(bw, x + fw), min(bh, y + fh)
if x0 >= x1 or y0 >= y1:
return
fx0, fy0 = x0 - x, y0 - y
fx1, fy1 = fx0 + (x1 - x0), fy0 + (y1 - y0)
roi_bg = bg_bgr[y0:y1, x0:x1]
roi_fg = fg_scaled_bgra[fy0:fy1, fx0:fx1]
a = roi_fg[:, :, 3:4].astype(np.float32) / 255.0
fg_rgb = roi_fg[:, :, :3].astype(np.float32)
bg_rgb = roi_bg.astype(np.float32)
blended = fg_rgb * a + bg_rgb * (1.0 - a)
roi_bg[:] = blended.astype(np.uint8)
def _perspective_warp_rgba(img_bgra, jitter_frac: float, rng: random.Random, np, cv2):
"""
对前景做轻微透视(四角微移),返回 (warped BGRA, M)。
M 为 3×3将透视前图像平面上的点映射到 warped 图像像素坐标;未应用透视时返回 (copy, None)。
jitter_frac扰动幅度约为 min(w,h) 的比例。
"""
h, w = img_bgra.shape[:2]
if jitter_frac <= 0 or min(w, h) < 16:
return img_bgra.copy(), None
j = float(max(1.5, min(w, h) * jitter_frac))
def dj():
return rng.uniform(-j, j)
pts_src = np.float32([[0, 0], [w, 0], [w, h], [0, h]])
pts_dst = np.float32(
[
[dj(), dj()],
[w + dj(), dj()],
[w + dj(), h + dj()],
[dj(), h + dj()],
]
)
xmin = float(pts_dst[:, 0].min())
ymin = float(pts_dst[:, 1].min())
pts_shift = pts_dst.copy()
pts_shift[:, 0] -= xmin
pts_shift[:, 1] -= ymin
out_w = max(4, int(np.ceil(float(pts_shift[:, 0].max()))) + 2)
out_h = max(4, int(np.ceil(float(pts_shift[:, 1].max()))) + 2)
M = cv2.getPerspectiveTransform(pts_src, pts_shift)
warped = cv2.warpPerspective(
img_bgra,
M,
(out_w, out_h),
flags=cv2.INTER_LINEAR,
borderMode=cv2.BORDER_CONSTANT,
borderValue=(0, 0, 0, 0),
)
return warped, M
def _color_jitter_bgr(comp_bgr, strength: float, rng: random.Random, np, cv2):
"""整图 HSV 抖动strength∈[0,1] 越大越强。"""
if strength <= 1e-6:
return comp_bgr
strength = min(1.0, max(0.0, strength))
hsv = cv2.cvtColor(comp_bgr, cv2.COLOR_BGR2HSV).astype(np.float32)
dh = rng.uniform(-18.0 * strength, 18.0 * strength)
hsv[:, :, 0] = (hsv[:, :, 0] + dh) % 180.0
sf = rng.uniform(1.0 - 0.22 * strength, 1.0 + 0.22 * strength)
vf = rng.uniform(1.0 - 0.22 * strength, 1.0 + 0.22 * strength)
hsv[:, :, 1] = np.clip(hsv[:, :, 1] * sf, 0, 255)
hsv[:, :, 2] = np.clip(hsv[:, :, 2] * vf, 0, 255)
# 轻微 BGR 通道偏置(模拟白平衡)
out = cv2.cvtColor(hsv.astype(np.uint8), cv2.COLOR_HSV2BGR).astype(np.float32)
bias = np.array(
[
rng.uniform(-12 * strength, 12 * strength),
rng.uniform(-12 * strength, 12 * strength),
rng.uniform(-12 * strength, 12 * strength),
],
dtype=np.float32,
)
out = np.clip(out + bias, 0, 255).astype(np.uint8)
return out
def _motion_blur_bgr(
    comp_bgr,
    rng: random.Random,
    k_min: int,
    k_max: int,
    np,
    cv2,
):
    """
    Linear motion blur along a random direction (filter2D). The kernel is an odd k x k
    containing a normalized line segment through the center at a uniform angle in
    [0, 180). Labels need no change: boxes stay at the object's true position, matching
    the usual "blurred image + sharp box" training convention for real cameras.
    """
    lo = int(max(3, k_min | 1))
    hi = int(max(lo, k_max | 1))
    k = rng.randint(lo, hi)
    if k % 2 == 0:  # randint may land on an even value between the odd bounds
        k = min(hi, k + 1)
    k = max(3, k)
    kernel_u8 = np.zeros((k, k), dtype=np.uint8)
    angle = rng.uniform(0.0, 180.0)
    theta = float(np.deg2rad(angle))
    mid = k // 2
    dx = float(np.cos(theta) * (k // 2))
    dy = float(np.sin(theta) * (k // 2))
    start = (int(round(mid - dx)), int(round(mid - dy)))
    end = (int(round(mid + dx)), int(round(mid + dy)))
    cv2.line(kernel_u8, start, end, 255, 1)
    total = float(kernel_u8.sum())
    if total < 1e-3:  # degenerate line: fall back to a single center tap
        kernel_u8[mid, mid] = 255
        total = 255.0
    return cv2.filter2D(comp_bgr, -1, kernel_u8.astype(np.float32) / total)
def _yolo_line(cls: int, xyxy_on_bg, img_w: int, img_h: int) -> str:
x0, y0, x1, y1 = xyxy_on_bg
bw, bh = x1 - x0, y1 - y0
cx = (x0 + x1) / 2.0 / img_w
cy = (y0 + y1) / 2.0 / img_h
nw = bw / img_w
nh = bh / img_h
cx = max(0.0, min(1.0, cx))
cy = max(0.0, min(1.0, cy))
nw = max(1e-6, min(1.0, nw))
nh = max(1e-6, min(1.0, nh))
return f"{cls} {cx:.6f} {cy:.6f} {nw:.6f} {nh:.6f}\n"
def _xyxy_exclusive_to_voc_inclusive(
x0: float, y0: float, x1: float, y1: float, img_w: int, img_h: int
) -> tuple[int, int, int, int]:
"""内部 xyxy 为半开区间 [x0,x1)×[y0,y1),转为 VOC inclusive 整数像素框。"""
iw, ih = max(1, img_w), max(1, img_h)
xi0 = max(0, min(iw - 1, int(x0)))
yi0 = max(0, min(ih - 1, int(y0)))
xi1 = max(xi0, min(iw - 1, int(x1) - 1))
yi1 = max(yi0, min(ih - 1, int(y1) - 1))
return xi0, yi0, xi1, yi1
def _write_pascal_voc_xml(
xml_path: str,
img_filename: str,
img_folder: str,
img_w: int,
img_h: int,
depth: int,
objects: list[tuple],
) -> None:
"""
objects 每项为 (class_name, xmin, ymin, xmax, ymax) 或
(class_name, xmin, ymin, xmax, ymax, truncated),坐标均为 inclusive 整数像素;
truncated 为 \"0\"\"1\"(省略时默认为 \"0\")。
"""
root = ET.Element("annotation")
ET.SubElement(root, "folder").text = img_folder
ET.SubElement(root, "filename").text = img_filename
src = ET.SubElement(root, "source")
ET.SubElement(src, "database").text = "synthetic_archery"
ET.SubElement(src, "annotation").text = "Pascal VOC compatible"
sz = ET.SubElement(root, "size")
ET.SubElement(sz, "width").text = str(img_w)
ET.SubElement(sz, "height").text = str(img_h)
ET.SubElement(sz, "depth").text = str(depth)
ET.SubElement(root, "segmented").text = "0"
for item in objects:
if len(item) == 6:
name, xmin, ymin, xmax, ymax, truncated = item
else:
name, xmin, ymin, xmax, ymax = item
truncated = "0"
obj = ET.SubElement(root, "object")
ET.SubElement(obj, "name").text = name
ET.SubElement(obj, "pose").text = "Unspecified"
ET.SubElement(obj, "truncated").text = str(truncated)
ET.SubElement(obj, "difficult").text = "0"
bb = ET.SubElement(obj, "bndbox")
ET.SubElement(bb, "xmin").text = str(xmin)
ET.SubElement(bb, "ymin").text = str(ymin)
ET.SubElement(bb, "xmax").text = str(xmax)
ET.SubElement(bb, "ymax").text = str(ymax)
tree = ET.ElementTree(root)
try:
ET.indent(tree, space=" ")
except AttributeError:
pass
tree.write(xml_path, encoding="utf-8", xml_declaration=True)
def _zip_images_xml(dataset_root: str, zip_path: str) -> None:
"""打包 dataset_root 下的 images/ 与 xml/ 到 zip根目录含这两个文件夹"""
img_dir = os.path.join(dataset_root, "images")
xml_dir = os.path.join(dataset_root, "xml")
if not os.path.isdir(img_dir) or not os.path.isdir(xml_dir):
raise FileNotFoundError(f"需要存在目录: {img_dir}{xml_dir}")
zip_path = os.path.abspath(zip_path)
os.makedirs(os.path.dirname(zip_path) or ".", exist_ok=True)
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
for folder, arc_prefix in ((img_dir, "images"), (xml_dir, "xml")):
for name in sorted(os.listdir(folder)):
fp = os.path.join(folder, name)
if os.path.isfile(fp):
zf.write(fp, arcname=os.path.join(arc_prefix, name).replace("\\", "/"))
def main():
    """CLI entry point: compose the target sprite onto random backgrounds and emit VOC/YOLO labels."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--bg-dir", required=True, help="背景图目录")
    ap.add_argument("--fg", required=True, help="靶子 PNG推荐 RGBA 抠图)或任意图")
    ap.add_argument("--out", default="./synth_dataset", help="输出根目录")
    ap.add_argument("--num", type=int, default=200, help="请求生成张数(实际不超过 --max-images")
    ap.add_argument(
        "--max-images",
        type=int,
        default=3000,
        help="最多生成图片张数超出部分忽略MaixCam 等平台常见上限 3000",
    )
    ap.add_argument(
        "--format",
        choices=("voc", "yolo", "both"),
        default="voc",
        help="voc=Pascal VOCimages+xmlyolo=labels txtboth=两者都写",
    )
    ap.add_argument(
        "--class-name",
        default="黑三角和圆环",
        help="VOC <object><name> 类别名(单类检测默认 target",
    )
    ap.add_argument("--class-id", type=int, default=0, help="YOLO 类别 id仅 --format yolo/both")
    ap.add_argument(
        "--zip",
        default=None,
        metavar="PATH",
        help="完成后将 images/ 与 xml/ 打成 zip仅 VOC/both 时有 xml路径如 ./dataset.zip",
    )
    ap.add_argument("--seed", type=int, default=None)
    ap.add_argument("--scale-min", type=float, default=0.15, help="靶子最短边占背景最短边比例下限")
    ap.add_argument("--scale-max", type=float, default=0.55, help="比例上限")
    ap.add_argument("--blur-max", type=float, default=0.0, help="高斯模糊 sigma 上限0 关闭")
    ap.add_argument(
        "--motion-prob",
        type=float,
        default=0.45,
        help="运动模糊概率 0~1默认约一半样本关模糊用 0",
    )
    ap.add_argument(
        "--motion-kernel-min",
        type=int,
        default=7,
        help="运动模糊 streak 长度下限(奇数,实际会纠到奇数)",
    )
    ap.add_argument(
        "--motion-kernel-max",
        type=int,
        default=35,
        help="运动模糊 streak 长度上限,越大越像长曝光/手抖",
    )
    ap.add_argument("--jpeg-quality", type=int, default=92)
    ap.add_argument(
        "--perspective",
        type=float,
        default=0.0,
        help="轻微透视:四角扰动约为 min(靶宽,靶高)×该系数0 关闭(建议 0.02~0.06",
    )
    ap.add_argument(
        "--perspective-prob",
        type=float,
        default=0.75,
        help="每张图应用透视的概率 0~1",
    )
    ap.add_argument(
        "--color-jitter",
        type=float,
        default=0.0,
        help="合成后整图颜色抖动强度 0~10 关闭(建议 0.4~0.8",
    )
    ap.add_argument(
        "--triangles-json",
        default=None,
        metavar="PATH",
        help="三角形顶点 JSONtest/archery_triangles_default.json坐标与 --fg 原图一致,"
        "多三角形时每张图写多个 VOC <object>(透视时顶点同步变换)",
    )
    ap.add_argument(
        "--triangle-bbox-pad-frac",
        type=float,
        default=0.0,
        help="三角形检测框在紧 AABB 四周再加 max(宽,高)×该比例VOC/YOLO 同步);"
        "0=贴顶点外接框Stage2 建议 0.08~0.18,与推理端 margin 接近更易对齐",
    )
    ap.add_argument(
        "--stage2-crop",
        action="store_true",
        help="合成与增强后按靶子外接框+随机边距裁剪,输出与 Stage2整靶 ROI构图一致标注为裁剪后坐标",
    )
    ap.add_argument(
        "--stage2-pad-min",
        type=float,
        default=0.02,
        help="Stage2 裁剪:四边 padding 相对靶 max(宽,高) 的比例下限",
    )
    ap.add_argument(
        "--stage2-pad-max",
        type=float,
        default=0.14,
        help="Stage2 裁剪padding 比例上限",
    )
    ap.add_argument(
        "--stage2-allow-partial",
        action="store_true",
        help="有 --triangles-json 时允许裁剪后有效三角形数少于 JSON默认要求数量一致",
    )
    args = ap.parse_args()
    # Heavy deps are imported lazily so argument parsing/--help works without them.
    try:
        import cv2
        import numpy as np
    except ImportError:
        print("[ERR] 需要 opencv-python、numpy")
        sys.exit(1)
    rng = random.Random(args.seed)
    bgs = _collect_images(args.bg_dir)
    if not bgs:
        print(f"[ERR] 背景目录无图片: {args.bg_dir}")
        sys.exit(1)
    fg_path = args.fg
    if not os.path.isfile(fg_path):
        print(f"[ERR] 找不到靶图: {fg_path}")
        sys.exit(1)
    fg = cv2.imread(fg_path, cv2.IMREAD_UNCHANGED)
    if fg is None:
        print(f"[ERR] 无法读取靶图: {fg_path}")
        sys.exit(1)
    # Normalize the foreground to BGRA: grayscale/BGR inputs get a fully opaque alpha.
    if fg.ndim == 2:
        fg = cv2.cvtColor(fg, cv2.COLOR_GRAY2BGRA)
    elif fg.shape[2] == 3:
        b, g, r = cv2.split(fg)
        a = np.full_like(b, 255)
        fg = cv2.merge([b, g, r, a])
    # Crop the sprite to its alpha bbox; triangle vertices are later shifted by (fx0, fy0).
    fx0, fy0, fw0, fh0 = _fg_bbox_from_alpha(fg)
    fg_crop = fg[fy0 : fy0 + fh0, fx0 : fx0 + fw0].copy()
    triangles_full = None
    if args.triangles_json:
        tpath = args.triangles_json
        if not os.path.isfile(tpath):
            print(f"[ERR] 找不到 --triangles-json: {tpath}")
            sys.exit(1)
        try:
            triangles_full = _load_triangles_json(tpath)
        except (json.JSONDecodeError, ValueError, OSError) as e:
            print(f"[ERR] 解析三角形 JSON 失败: {e}")
            sys.exit(1)
        print(f"[INFO] 已加载 {len(triangles_full)} 个三角形(每张图多个 VOC 检测框)")
    want_voc = args.format in ("voc", "both")
    want_yolo = args.format in ("yolo", "both")
    # Cap the requested count by --max-images (platform upload limits).
    n_gen = min(max(0, args.num), max(0, args.max_images))
    if args.num > args.max_images:
        print(f"[INFO] --num={args.num} 大于 --max-images={args.max_images},仅生成 {n_gen}")
    if args.stage2_crop:
        print(
            f"[INFO] Stage2 裁剪: pad∈[{args.stage2_pad_min},{args.stage2_pad_max}]×max(靶宽,靶高)"
            f"partial={'允许' if args.stage2_allow_partial else '不允许'}"
        )
    out_img = os.path.join(args.out, "images")
    out_xml = os.path.join(args.out, "xml")
    out_lbl = os.path.join(args.out, "labels")
    os.makedirs(out_img, exist_ok=True)
    if want_voc:
        os.makedirs(out_xml, exist_ok=True)
    if want_yolo:
        os.makedirs(out_lbl, exist_ok=True)
    print(f"[INFO] 背景 {len(bgs)} 张,格式={args.format},生成 {n_gen} 张 → {args.out}")
    i_done = 0
    # Rejection-sampling loop: any `continue` below retries with a fresh random draw
    # until n_gen images have actually been written.
    while i_done < n_gen:
        bg_path = rng.choice(bgs)
        bg = cv2.imread(bg_path, cv2.IMREAD_COLOR)
        if bg is None:
            continue
        bh, bw = bg.shape[:2]
        # Scale so the sprite's short side is a random fraction of the background's short side.
        short_bg = min(bh, bw)
        short_fg = min(fh0, fw0)
        smin = args.scale_min * short_bg / max(short_fg, 1)
        smax = args.scale_max * short_bg / max(short_fg, 1)
        scale = rng.uniform(max(smin, 0.05), max(smax, smin + 0.01))
        new_w = max(4, int(fw0 * scale))
        new_h = max(4, int(fh0 * scale))
        fg_s = cv2.resize(fg_crop, (new_w, new_h), interpolation=cv2.INTER_AREA)
        persp_M = None
        if args.perspective > 0 and rng.random() < args.perspective_prob:
            fg_s, persp_M = _perspective_warp_rgba(fg_s, args.perspective, rng, np, cv2)
        fw2, fh2 = fg_s.shape[1], fg_s.shape[0]
        # Recompute the alpha bbox after warping (the canvas may have grown).
        tx0, ty0, tw, th = _fg_bbox_from_alpha(fg_s)
        max_x = max(0, bw - fw2)
        max_y = max(0, bh - fh2)
        px = rng.randint(0, max_x) if max_x > 0 else 0
        py = rng.randint(0, max_y) if max_y > 0 else 0
        comp = bg.copy()
        _paste_fg_on_bg(comp, px, py, fg_s)
        # Label: whole-target alpha box (used when no triangles-json) or multiple triangle boxes
        bx0 = px + tx0
        by0 = py + ty0
        bx1 = px + tx0 + tw
        by1 = py + ty0 + th
        bx0 = max(0, min(bx0, bw - 1))
        by0 = max(0, min(by0, bh - 1))
        bx1 = max(bx0 + 1, min(bx1, bw))
        by1 = max(by0 + 1, min(by1, bh))
        # Project every configured triangle into composite-image coordinates.
        tri_pts_full: list[np.ndarray] = []
        if triangles_full is not None:
            for tri in triangles_full:
                pts_c = _warp_triangle_points(
                    tri,
                    float(fx0),
                    float(fy0),
                    float(fw0),
                    float(fh0),
                    new_w,
                    new_h,
                    persp_M,
                    px,
                    py,
                    np,
                    cv2,
                )
                tri_pts_full.append(pts_c)
        # Photometric augmentations: these do not move geometry, so labels stay valid.
        if args.color_jitter > 1e-6:
            comp = _color_jitter_bgr(comp, args.color_jitter, rng, np, cv2)
        if args.blur_max > 1e-6:
            sig = rng.uniform(0.3, args.blur_max)
            k = int(sig * 4) | 1
            comp = cv2.GaussianBlur(comp, (k, k), sig)
        if rng.random() < max(0.0, min(1.0, float(args.motion_prob))):
            comp = _motion_blur_bgr(
                comp,
                rng,
                args.motion_kernel_min,
                args.motion_kernel_max,
                np,
                cv2,
            )
        bh, bw = comp.shape[:2]
        if args.stage2_crop:
            # Stage2: crop around the target box with random margins; labels are
            # re-expressed in the cropped image's coordinate system.
            win = _stage2_crop_window(
                bx0,
                by0,
                bx1,
                by1,
                bw,
                bh,
                args.stage2_pad_min,
                args.stage2_pad_max,
                rng,
            )
            if win is None:
                continue
            cx0, cy0, cw, ch = win
            comp = comp[cy0 : cy0 + ch, cx0 : cx0 + cw].copy()
            out_w, out_h = cw, ch
            if triangles_full is not None:
                voc_objects = []
                yolo_lines_list = []
                for pts_c in tri_pts_full:
                    # Shift triangle vertices into crop coordinates before boxing.
                    p2 = pts_c.copy()
                    p2[:, 0] -= cx0
                    p2[:, 1] -= cy0
                    pair = _triangle_to_voc_tuple(
                        p2,
                        out_w,
                        out_h,
                        args.class_name,
                        args.triangle_bbox_pad_frac,
                    )
                    if pair is None:
                        continue
                    vo, xyxy = pair
                    voc_objects.append(vo)
                    if want_yolo:
                        yolo_lines_list.append(
                            _yolo_line(args.class_id, xyxy, out_w, out_h)
                        )
                # Unless --stage2-allow-partial, discard samples where cropping
                # lost any triangle and resample.
                if not args.stage2_allow_partial and len(voc_objects) != len(
                    triangles_full
                ):
                    continue
                if want_voc and not voc_objects:
                    continue
                if want_yolo and not yolo_lines_list:
                    continue
            else:
                # Single whole-target box, shifted into crop coordinates and re-clamped.
                nbx0, nby0 = bx0 - cx0, by0 - cy0
                nbx1, nby1 = bx1 - cx0, by1 - cy0
                nbx0 = max(0, min(nbx0, out_w - 1))
                nby0 = max(0, min(nby0, out_h - 1))
                nbx1 = max(nbx0 + 1, min(nbx1, out_w))
                nby1 = max(nby0 + 1, min(nby1, out_h))
                if nbx1 <= nbx0 or nby1 <= nby0:
                    continue
                vx0, vy0, vx1, vy1 = _xyxy_exclusive_to_voc_inclusive(
                    nbx0, nby0, nbx1, nby1, out_w, out_h
                )
                voc_objects = [(args.class_name, vx0, vy0, vx1, vy1)]
                yolo_lines_list = (
                    [_yolo_line(args.class_id, (nbx0, nby0, nbx1, nby1), out_w, out_h)]
                    if want_yolo
                    else []
                )
        else:
            # No Stage2 crop: labels are in full composite-image coordinates.
            out_w, out_h = bw, bh
            if triangles_full is not None:
                voc_objects = []
                yolo_lines_list = []
                for pts_c in tri_pts_full:
                    pair = _triangle_to_voc_tuple(
                        pts_c,
                        out_w,
                        out_h,
                        args.class_name,
                        args.triangle_bbox_pad_frac,
                    )
                    if pair is None:
                        continue
                    vo, xyxy = pair
                    voc_objects.append(vo)
                    if want_yolo:
                        yolo_lines_list.append(
                            _yolo_line(args.class_id, xyxy, out_w, out_h)
                        )
                if want_voc and not voc_objects:
                    continue
                if want_yolo and not yolo_lines_list:
                    continue
            else:
                vx0, vy0, vx1, vy1 = _xyxy_exclusive_to_voc_inclusive(
                    bx0, by0, bx1, by1, out_w, out_h
                )
                voc_objects = [(args.class_name, vx0, vy0, vx1, vy1)]
                yolo_lines_list = (
                    [_yolo_line(args.class_id, (bx0, by0, bx1, by1), out_w, out_h)]
                    if want_yolo
                    else []
                )
        # Persist the sample: image first, then whichever label formats were requested.
        stem = f"synth_{i_done:06d}"
        img_name = stem + ".jpg"
        img_path = os.path.join(out_img, img_name)
        cv2.imwrite(img_path, comp, [int(cv2.IMWRITE_JPEG_QUALITY), args.jpeg_quality])
        if want_voc:
            xml_path = os.path.join(out_xml, stem + ".xml")
            _write_pascal_voc_xml(
                xml_path,
                img_filename=img_name,
                img_folder="images",
                img_w=out_w,
                img_h=out_h,
                depth=3,
                objects=voc_objects,
            )
        if want_yolo:
            lbl_path = os.path.join(out_lbl, stem + ".txt")
            with open(lbl_path, "w", encoding="utf-8") as f:
                f.writelines(yolo_lines_list)
        i_done += 1
        if i_done % 50 == 0:
            print(f" ... {i_done}/{n_gen}")
    parts = [out_img]
    if want_voc:
        parts.append(out_xml)
    if want_yolo:
        parts.append(out_lbl)
    print(f"[OK] 完成: " + " , ".join(parts))
    if args.zip:
        if not want_voc:
            print("[WARN] --zip 需要 VOC 标注目录 xml/,当前格式未生成 xml跳过打包")
        else:
            try:
                _zip_images_xml(args.out, args.zip)
                print(f"[OK] 已打包: {os.path.abspath(args.zip)}")
            except OSError as e:
                print(f"[ERR] 打包失败: {e}")
                sys.exit(1)
if __name__ == "__main__":
main()