Sam3_auto/detect.py at main · Autismab/Sam3_auto · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
"""
Online stage: load template features and run zero-shot detection on new images.

Usage:
    python detect.py --image test.jpg --db templates.db --checkpoint sam3.pt
    python detect.py --image test.jpg --db templates.db --checkpoint sam3.pt --defect_type scratch
    python detect.py --image_dir ./images/ --db templates.db --checkpoint sam3.pt
"""
import sys
import argparse
import time
from pathlib import Path
from typing import Optional

import torch
import numpy as np
from PIL import Image, ImageDraw

# Hard-coded directory (presumably a local SAM3 checkout — adjust per machine).
# It is prepended to sys.path once so the `sam3.*` imports that follow can be
# resolved from it; main() also uses it to locate the default BPE vocab file.
SAM3_ROOT = Path("E:/code/sam/sam3")
if str(SAM3_ROOT) not in sys.path:
    sys.path.insert(0, str(SAM3_ROOT))

from sam3.model_builder import build_sam3_image_model
from sam3.model.sam3_image_processor import Sam3Processor
from sam3.model.data_misc import interpolate
from sam3.model import box_ops

from template_manager import TemplateManager
from sam3_template_model import patch_model_for_template_injection


def cuda_sync(device: "str | torch.device" = "cuda") -> None:
    """Wait for pending CUDA work on ``device`` to finish.

    CUDA kernels are launched asynchronously, so this is called around timed
    sections to make ``time.perf_counter()`` readings reflect completed work.
    The call is a no-op for non-CUDA devices, for unrecognised ``device``
    values, and when CUDA is not available.

    Args:
        device: Either a device string (``"cuda"``, ``"cuda:1"``, ``"cpu"``)
            or a ``torch.device``. The specific device given is the one that
            gets synchronised, not merely the current default CUDA device.
    """
    if isinstance(device, torch.device):
        targets_cuda = device.type == "cuda"
    else:
        targets_cuda = isinstance(device, str) and device.startswith("cuda")

    if targets_cuda and torch.cuda.is_available():
        torch.cuda.synchronize(device)


def detect_single(
    image_path: str,
    processor: Sam3Processor,
    model,
    template_features: torch.Tensor,
    template_ref_boxes: Optional[torch.Tensor],
    cached_text_out: Optional[dict] = None,
    device: str = "cuda",
    confidence_threshold: float = 0.3,
    text_prompt: Optional[str] = None,
    use_templates: bool = True,
    use_masks: bool = True,
    use_text: bool = True,
):
    """
    Run template-injected detection on a single image.

    Every stage is bracketed by ``cuda_sync`` calls and timed with
    ``time.perf_counter`` so the per-stage timings cover finished GPU work.

    Args:
        image_path: Path of the image to open; it is converted to RGB.
        processor: Processor providing ``set_image`` and ``find_stage``.
        model: Model exposing ``backbone.forward_text``, ``_get_dummy_prompt``,
            ``set_templates``, ``clear_templates`` and ``forward_grounding``.
        template_features: Passed to ``model.set_templates`` when
            ``use_templates`` is true.
        template_ref_boxes: Second argument to ``model.set_templates``;
            may be ``None``.
        cached_text_out: Pre-computed text-branch output. When given it is
            merged into ``state["backbone_out"]`` and no text is encoded, so
            ``text_prompt`` and ``use_text`` have no effect.
        device: Device used for text encoding and for timing synchronisation.
        confidence_threshold: Only detections scoring strictly above this
            value are kept.
        text_prompt: Prompt encoded when there is no cached text output and
            ``use_text`` is true; otherwise the placeholder ``"visual"`` is
            encoded.
        use_templates: If false, ``model.clear_templates()`` is called instead
            of injecting templates.
        use_masks: If false, mask post-processing is skipped.
        use_text: See ``text_prompt``.

    Returns:
        dict with keys:
            ``boxes``: kept boxes converted to xyxy and scaled by the image
                width/height.
            ``masks``: boolean masks resized to the image size, or ``None``
                when masks are disabled or the model returned none.
            ``scores``: scores of the kept detections.
            ``image``: the loaded RGB ``PIL.Image``.
            ``timings``: seconds spent per stage plus their sum under
                ``"total"``.
    """
    timings = {}

    t0 = time.perf_counter()
    # Context manager guarantees the underlying file handle is released;
    # convert() returns an independent in-memory copy that outlives it.
    with Image.open(image_path) as src:
        image = src.convert("RGB")
    img_w, img_h = image.size
    timings["image_load"] = time.perf_counter() - t0

    with torch.inference_mode():
        # --- image encoding ---
        cuda_sync(device)
        t0 = time.perf_counter()
        state = processor.set_image(image)
        cuda_sync(device)
        timings["image_encode"] = time.perf_counter() - t0

        # --- text encoding (or reuse of the cached result) ---
        cuda_sync(device)
        t0 = time.perf_counter()
        if cached_text_out is not None:
            state["backbone_out"].update(cached_text_out)
        else:
            prompt_to_use = text_prompt if (use_text and text_prompt) else "visual"
            text_out = model.backbone.forward_text([prompt_to_use], device=device)
            state["backbone_out"].update(text_out)
        cuda_sync(device)
        timings["text_encode"] = time.perf_counter() - t0

        # --- geometric prompt: fall back to the model's dummy prompt ---
        cuda_sync(device)
        t0 = time.perf_counter()
        if "geometric_prompt" not in state:
            state["geometric_prompt"] = model._get_dummy_prompt()
        cuda_sync(device)
        timings["prompt_prepare"] = time.perf_counter() - t0

        # --- template injection on/off ---
        cuda_sync(device)
        t0 = time.perf_counter()
        if use_templates:
            model.set_templates(template_features, template_ref_boxes)
        else:
            model.clear_templates()
        cuda_sync(device)
        timings["template_prepare"] = time.perf_counter() - t0

        # --- forward pass ---
        cuda_sync(device)
        t0 = time.perf_counter()
        find_stage = processor.find_stage
        out = model.forward_grounding(
            backbone_out=state["backbone_out"],
            find_input=find_stage,
            geometric_prompt=state["geometric_prompt"],
            find_target=None,
        )
        cuda_sync(device)
        timings["model_forward"] = time.perf_counter() - t0

        # --- box post-processing ---
        cuda_sync(device)
        t0 = time.perf_counter()
        # Shapes below are the original author's notes — TODO confirm.
        pred_logits = out["pred_logits"]  # (batch, nq, 1)
        pred_boxes = out["pred_boxes"]    # (batch, nq, 4) cxcywh normalized
        pred_masks = out.get("pred_masks")  # (batch, nq, H, W)

        # Final score = per-query probability * presence probability.
        presence_score = out["presence_logit_dec"].sigmoid().unsqueeze(1)
        probs = pred_logits.sigmoid()
        # NOTE(review): squeeze(0) only drops the batch axis when batch == 1.
        probs = (probs * presence_score).squeeze(-1).squeeze(0)  # (nq,)

        keep = probs > confidence_threshold
        scores = probs[keep]
        boxes_cxcywh = pred_boxes.squeeze(0)[keep]
        boxes_xyxy = box_ops.box_cxcywh_to_xyxy(boxes_cxcywh)
        # Build the scale on the predictions' own device so the multiply
        # cannot hit a device mismatch if `device` differs from the model's.
        scale = torch.tensor(
            [img_w, img_h, img_w, img_h], device=boxes_xyxy.device
        )
        boxes_xyxy = boxes_xyxy * scale
        cuda_sync(device)
        timings["box_postprocess"] = time.perf_counter() - t0

        # --- mask post-processing ---
        cuda_sync(device)
        t0 = time.perf_counter()
        masks = None
        if use_masks and pred_masks is not None:
            masks = pred_masks.squeeze(0)[keep]
            # Upsample mask logits to the image size, then binarise at 0.5.
            masks = interpolate(
                masks.unsqueeze(1),
                (img_h, img_w),
                mode="bilinear",
                align_corners=False,
            ).sigmoid()
            masks = masks > 0.5
        cuda_sync(device)
        timings["mask_postprocess"] = time.perf_counter() - t0

    # Sum of the stages recorded above ("total" itself is added afterwards).
    timings["total"] = sum(timings.values())

    return {
        "boxes": boxes_xyxy,
        "masks": masks,
        "scores": scores,
        "image": image,
        "timings": timings,
    }


def visualize(
    result: dict,
    save_path: Optional[str] = None,
    max_masks: Optional[int] = 10,
):
    """Render masks, boxes and scores of a detection result onto the image.

    Args:
        result: Dict with keys ``"image"`` (PIL image), ``"boxes"``
            (xyxy boxes), ``"scores"`` and ``"masks"`` (may be ``None``).
            Boxes, scores and masks may be tensors or plain sequences.
        save_path: Where to write the rendering. When falsy, the image is
            shown with ``Image.show()`` instead. For ``.jpg`` / ``.jpeg``
            paths the file is written as RGB, since JPEG has no alpha channel.
        max_masks: Maximum number of masks to overlay (default 10, the
            previously hard-coded limit). ``None`` draws all masks. Expected
            to be non-negative.

    Returns:
        The rendered RGBA ``PIL.Image``.
    """
    image = result["image"].copy().convert("RGBA")
    boxes = result["boxes"]
    scores = result["scores"]
    masks = result["masks"]
    img_w, img_h = image.size

    # Mask overlay
    if masks is not None and len(masks) > 0:
        palette = [
            (255, 0, 0, 100), (0, 255, 0, 100), (0, 0, 255, 100),
            (255, 255, 0, 100), (255, 0, 255, 100), (0, 255, 255, 100),
        ]
        shown = masks[:max_masks]
        if isinstance(shown, torch.Tensor):
            # One device-to-host transfer instead of one per mask.
            shown = shown.detach().cpu().numpy()
        overlay = Image.new("RGBA", (img_w, img_h), (0, 0, 0, 0))
        for idx, mask in enumerate(shown):
            if isinstance(mask, torch.Tensor):
                mask = mask.detach().cpu().numpy()
            if mask.ndim == 3:
                # Drop the leading singleton axis, keeping the 2-D mask.
                mask = mask[0]
            mask_bin = (mask > 0.5).astype(np.uint8) * 255
            mask_pil = Image.fromarray(mask_bin).resize((img_w, img_h), Image.NEAREST)
            color_img = Image.new("RGBA", (img_w, img_h), palette[idx % len(palette)])
            # Paint the colour only where the mask is set.
            overlay = Image.composite(color_img, overlay, mask_pil)
        image = Image.alpha_composite(image, overlay)

    # Boxes and scores
    if isinstance(boxes, torch.Tensor):
        boxes = boxes.detach().cpu().tolist()
    if isinstance(scores, torch.Tensor):
        scores = scores.detach().cpu().tolist()

    draw = ImageDraw.Draw(image)
    for box, score in zip(boxes, scores):
        if isinstance(box, torch.Tensor):
            box = box.detach().cpu().tolist()
        x1, y1, x2, y2 = box
        draw.rectangle([x1, y1, x2, y2], outline=(0, 255, 0, 255), width=3)
        # Put the label just above the box, clamped to the top edge.
        draw.text((x1 + 3, max(0, y1 - 15)), f"{float(score):.2f}", fill=(0, 255, 0, 255))

    if save_path:
        if Path(save_path).suffix.lower() in (".jpg", ".jpeg"):
            # Pillow cannot write an RGBA image as JPEG; save an RGB copy.
            image.convert("RGB").save(save_path)
        else:
            image.save(save_path)
        print(f"结果已保存: {save_path}")
    else:
        image.show()

    return image


def main():
    """Command-line entry point: load the model and templates, then detect.

    Steps:
        1. Parse arguments.
        2. Build the SAM3 image model, patch it for template injection and
           create the processor.
        3. Encode the text prompt once so it is reused for every image.
        4. Load template features and reference boxes from the template DB.
        5. Collect the image paths (``--image`` takes priority over
           ``--image_dir``).
        6. Run ``detect_single`` and ``visualize`` per image and print timings.

    The ``--no_templates``, ``--no_masks`` and ``--no_text`` flags are honoured
    as given on the command line; all three default to off.
    """
    parser = argparse.ArgumentParser(description="SAM3 模板注入零样本检测")
    parser.add_argument("--image", help="单张图片路径")
    parser.add_argument("--image_dir", default=r"E:\code\yolov10-main\datasets\Data\val\images",help="图片目录（批量检测）")
    parser.add_argument("--db", default="templates.db", help="模板数据库路径")
    parser.add_argument("--checkpoint", default="E:/code/sam/sam3/sam3.pt")
    parser.add_argument("--bpe", default=None)
    parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu")
    parser.add_argument("--threshold", type=float, default=0.5, help="置信度阈值")
    parser.add_argument("--defect_type", default="1", help="只加载指定缺陷类型的模板")
    parser.add_argument("--text_prompt", default="snake", help="可选的文本提示词")
    # Raw string: the backslashes are literal path separators, not escapes.
    parser.add_argument("--save_dir", default=r"E:\lora-scripts\sd-models\sam3_auto\out", help="结果保存目录")
    parser.add_argument("--no_templates", action="store_true", help="禁用模板注入，做对照测试")
    parser.add_argument("--no_masks", action="store_true", help="禁用 mask 后处理，做对照测试")
    parser.add_argument("--no_text", action="store_true", help="禁用真实文本提示词，改用 visual 占位文本做对照测试")
    args = parser.parse_args()

    # Load the model
    print(f"加载模型: {args.checkpoint}")
    model_load_start = time.perf_counter()
    bpe_path = args.bpe or str(SAM3_ROOT / "assets" / "bpe_simple_vocab_16e6.txt.gz")
    model = build_sam3_image_model(
        checkpoint_path=args.checkpoint,
        bpe_path=bpe_path,
        device=args.device,
        eval_mode=True,
    )
    model = patch_model_for_template_injection(model)
    processor = Sam3Processor(model, device=args.device)
    model_load_time = time.perf_counter() - model_load_start
    print(f"模型加载耗时: {model_load_time:.4f} 秒")
    print(f"开关状态: templates={'off' if args.no_templates else 'on'}, masks={'off' if args.no_masks else 'on'}, text={'off' if args.no_text else 'on'}")

    # Encode the text once up front so the text branch is not re-run per image
    prompt_to_use = args.text_prompt if (not args.no_text and args.text_prompt) else "visual"
    cuda_sync(args.device)
    text_cache_start = time.perf_counter()
    cached_text_out = model.backbone.forward_text([prompt_to_use], device=args.device)
    cuda_sync(args.device)
    text_cache_time = time.perf_counter() - text_cache_start
    print(f"文本预编码耗时: {text_cache_time:.4f} 秒 (prompt={prompt_to_use})")

    # Load templates; the manager is closed even if loading raises
    tm = TemplateManager(args.db)
    try:
        count = tm.count(args.defect_type)
        print(f"加载模板: {count} 条" + (f" (类型: {args.defect_type})" if args.defect_type else ""))
        template_features = tm.load_features_tensor(args.defect_type, args.device)

        # Reference boxes are used as stored in the DB
        # (normalized cxcywh per the original note — TODO confirm).
        templates = tm.load(args.defect_type, args.device)
        if templates:
            ref_boxes = [list(t["ref_box"]) for t in templates]
            template_ref_boxes = torch.tensor(ref_boxes, dtype=torch.float32, device=args.device)
        else:
            template_ref_boxes = None
    finally:
        tm.close()

    # Collect images: a single --image wins over --image_dir
    image_paths = []
    if args.image:
        image_paths.append(args.image)
    elif args.image_dir:
        img_dir = Path(args.image_dir)
        image_paths = sorted(
            str(p) for p in img_dir.glob("*")
            if p.suffix.lower() in (".jpg", ".jpeg", ".png", ".bmp")
        )

    if not image_paths:
        print("请指定 --image 或 --image_dir")
        return

    save_dir = Path(args.save_dir) if args.save_dir else None
    if save_dir:
        save_dir.mkdir(parents=True, exist_ok=True)

    # Detection loop
    total_infer_time = 0.0
    for img_path in image_paths:
        print(f"\n检测: {img_path}")
        infer_start = time.perf_counter()
        result = detect_single(
            img_path, processor, model,
            template_features, template_ref_boxes, cached_text_out,
            args.device, args.threshold, args.text_prompt,
            use_templates=not args.no_templates,
            use_masks=not args.no_masks,
            use_text=not args.no_text,
        )
        infer_time = time.perf_counter() - infer_start
        total_infer_time += infer_time
        n = len(result["scores"])
        timings = result.get("timings", {})
        print(f"  检测到 {n} 个目标")
        print(f"  推理耗时: {infer_time:.4f} 秒")
        print(f"    图片读取: {timings.get('image_load', 0.0):.4f} 秒")
        print(f"    图像编码: {timings.get('image_encode', 0.0):.4f} 秒")
        print(f"    文本编码: {timings.get('text_encode', 0.0):.4f} 秒")
        print(f"    Prompt准备: {timings.get('prompt_prepare', 0.0):.4f} 秒")
        print(f"    模板注入准备: {timings.get('template_prepare', 0.0):.4f} 秒")
        print(f"    模型前向: {timings.get('model_forward', 0.0):.4f} 秒")
        print(f"    框后处理: {timings.get('box_postprocess', 0.0):.4f} 秒")
        print(f"    Mask后处理: {timings.get('mask_postprocess', 0.0):.4f} 秒")
        print(f"    分项合计: {timings.get('total', 0.0):.4f} 秒")

        # Visualization is timed separately and not counted as inference time.
        vis_start = time.perf_counter()
        save_path = str(save_dir / (Path(img_path).stem + "_result.png")) if save_dir else None
        visualize(result, save_path)
        vis_time = time.perf_counter() - vis_start
        print(f"    可视化保存: {vis_time:.4f} 秒")

    # image_paths is non-empty here (empty case returned above).
    print(f"\n共处理 {len(image_paths)} 张图片")
    print(f"总推理耗时: {total_infer_time:.4f} 秒")
    print(f"平均每张推理耗时: {total_infer_time / len(image_paths):.4f} 秒")


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()