crop-screenshot/crop.py at main · visualdiffer/crop-screenshot · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
#!/usr/bin/env python3
"""Extracts the foreground element (menu/dialog) from a screenshot and saves it
as RGBA PNG with background/rounded corners made transparent.

Usage:
    crop.py INPUT... [--type {auto,menu,dialog}] [--seed X,Y] [--bbox X,Y,W,H]
            [--tolerance N] [--margin N] [--shrink N] [--min-score F]
            [--outdir DIR] [--debug]

Output is written to <outdir>/<original name> (default outdir = "cropped"),
keeping the same filename as the input.
"""

from __future__ import annotations

import argparse
import glob
import os
import sys

import cv2
import numpy as np


# --------------------------------------------------------------------------- #
# Foreground element detection
# --------------------------------------------------------------------------- #
def detect_element(img, kind="auto", debug_path=None):
    """Find the most likely floating foreground element (menu/dialog).

    Candidates are the bounding boxes of closed edge contours that cover
    between 2% and 95% of the image and stay more than 2 px away from every
    image border (embedded panels/windows reach the borders, overlays do
    not). Each surviving candidate is scored by its area fraction plus a
    weighted, non-negative shadow contrast; the highest score wins.

    Args:
        img: BGR image as returned by ``cv2.imread``.
        kind: ``"menu"`` additionally filters by aspect ratio and weights the
            shadow more; any other value uses the default weighting.
        debug_path: if truthy, an annotated copy of ``img`` showing the top
            candidates is written to this path.

    Returns:
        ``(x, y, w, h, score, contour)`` of the best candidate, or ``None``
        when no candidate is left.
    """
    img_h, img_w = img.shape[:2]
    total_area = float(img_w * img_h)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # edge map, morphologically closed so broken outlines form closed contours
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    closed = cv2.morphologyEx(cv2.Canny(gray, 30, 120), cv2.MORPH_CLOSE,
                              kernel, iterations=2)
    contours, _ = cv2.findContours(closed, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)

    def is_floating(bx, by, bw, bh):
        # True when the bbox keeps clear of all four image borders
        return (bx > 2 and by > 2 and
                bx + bw < img_w - 2 and by + bh < img_h - 2)

    cands = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        area = w * h
        frac = area / total_area
        if not 0.02 <= frac <= 0.95:
            continue
        if not is_floating(x, y, w, h):
            continue
        cands.append({"x": x, "y": y, "w": w, "h": h, "area": area,
                      "frac": frac,
                      "shadow": shadow_score(gray, x, y, w, h, img_w, img_h),
                      "contour": contour})

    def encloses(outer, inner, tol=2):
        # ``inner`` must be clearly smaller and lie inside ``outer`` (within tol px)
        if inner["area"] >= 0.95 * outer["area"]:
            return False
        return (outer["x"] <= inner["x"] + tol and
                outer["y"] <= inner["y"] + tol and
                outer["x"] + outer["w"] >= inner["x"] + inner["w"] - tol and
                outer["y"] + outer["h"] >= inner["y"] + inner["h"] - tol)

    # a box covering >60% of the image that wraps another candidate is treated
    # as a container (e.g. the whole app window), not as the overlay itself
    kept = [
        a for a in cands
        if not (a["frac"] > 0.6 and
                any(encloses(a, b) for b in cands if b is not a))
    ]
    pool = kept or cands

    if kind == "menu":
        # menus are compact popups: drop very wide/short or very tall strips,
        # unless that would leave nothing to choose from
        compact = [c for c in pool if 0.15 <= c["w"] / c["h"] <= 2.5]
        pool = compact or pool
        shadow_weight = 0.6  # the popup shadow is a stronger signal for menus
    else:
        shadow_weight = 0.3
    for c in pool:
        c["score"] = c["frac"] + shadow_weight * max(0.0, c["shadow"])

    best = max(pool, key=lambda c: c["score"], default=None)

    if debug_path:
        dbg = img.copy()
        ranked = sorted(pool, key=lambda c: c["score"], reverse=True)
        for c in ranked[:8]:
            top_left = (c["x"], c["y"])
            bottom_right = (c["x"] + c["w"], c["y"] + c["h"])
            cv2.rectangle(dbg, top_left, bottom_right, (0, 0, 255), 2)
            cv2.putText(dbg, f"{c['score']:.2f}", (c["x"] + 4, c["y"] + 24),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)
        if best is not None:
            # the winner is outlined in green on top of the red candidates
            cv2.rectangle(dbg, (best["x"], best["y"]),
                          (best["x"] + best["w"], best["y"] + best["h"]),
                          (0, 255, 0), 3)
        cv2.imwrite(debug_path, dbg)

    if best is None:
        return None
    return (best["x"], best["y"], best["w"], best["h"], best["score"], best["contour"])


def shadow_score(gray, x, y, w, h, W, H, band=8):
    """Measure how much darker the outside of a bbox is than its inside.

    For each of the four sides, the mean gray level of a ``band``-pixel strip
    just outside the bbox is compared with the strip just inside it. Strips
    are clamped to the ``W`` x ``H`` image; a side whose strip ends up empty
    is ignored.

    Returns:
        Mean of ``(inner - outer) / 255`` over the usable sides, or ``0.0``
        when no side is usable. Positive means the surroundings are darker
        than the element (shadow / dimmed background).
    """
    def strip_mean(left, top, right, bottom):
        left, top = max(0, left), max(0, top)
        right, bottom = min(W, right), min(H, bottom)
        if right <= left or bottom <= top:
            return None
        return float(gray[top:bottom, left:right].mean())

    x_end, y_end = x + w, y + h
    # (outer strip, inner strip) as (x0, y0, x1, y1) for every side
    sides = (
        ((x, y - band, x_end, y), (x, y, x_end, y + band)),                  # top
        ((x, y_end, x_end, y_end + band), (x, y_end - band, x_end, y_end)),  # bottom
        ((x - band, y, x, y_end), (x, y, x + band, y_end)),                  # left
        ((x_end, y, x_end + band, y_end), (x_end - band, y, x_end, y_end)),  # right
    )

    contrasts = []
    for outer_rect, inner_rect in sides:
        outer = strip_mean(*outer_rect)
        inner = strip_mean(*inner_rect)
        if outer is None or inner is None:
            continue
        contrasts.append((inner - outer) / 255.0)

    if not contrasts:
        return 0.0
    return float(np.mean(contrasts))


# --------------------------------------------------------------------------- #
# Background transparency / rounded corners
# --------------------------------------------------------------------------- #
def mask_from_contour(shape, contour, shrink=2):
    """Build a 0/255 mask covering the area enclosed by ``contour``.

    The contour is drawn filled onto a blank single-channel canvas of size
    ``shape[:2]``, small holes/irregularities are closed morphologically, and
    the result is optionally eroded with an elliptical kernel of side
    ``2 * shrink + 1`` so that background/shadow bleed along the outline is
    left out of the mask.

    Returns:
        ``np.uint8`` mask with the same height and width as ``shape``.
    """
    filled = np.zeros(shape[:2], np.uint8)
    cv2.drawContours(filled, [contour], -1, 255, thickness=cv2.FILLED)

    # smooth out small gaps in the filled shape
    close_kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))
    filled = cv2.morphologyEx(filled, cv2.MORPH_CLOSE, close_kernel, iterations=2)

    if shrink <= 0:
        return filled

    # pull the edge inward to drop colored background bleed
    side = 2 * shrink + 1
    erode_kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (side, side))
    return cv2.erode(filled, erode_kernel, iterations=1)


def make_transparent(crop, tolerance):
    """Return ``crop`` as BGRA with corner-connected background made transparent.

    A fixed-range, 4-connected flood fill is started from each of the four
    corners of the crop; every pixel reached (within ``tolerance`` per channel
    of the corner's color) gets alpha 0, everything else alpha 255. The alpha
    channel is then blurred with a 3x3 Gaussian to soften the edges.
    """
    height, width = crop.shape[:2]
    # floodFill requires a mask 2 px larger than the image in each dimension
    fill_mask = np.zeros((height + 2, width + 2), np.uint8)
    flags = 4 | (255 << 8) | cv2.FLOODFILL_MASK_ONLY | cv2.FLOODFILL_FIXED_RANGE
    delta = (tolerance,) * 3
    work = crop.copy()

    right, bottom = width - 1, height - 1
    for corner in ((0, 0), (right, 0), (0, bottom), (right, bottom)):
        cv2.floodFill(work, fill_mask, corner, 0, delta, delta, flags)

    alpha = np.full((height, width), 255, np.uint8)
    alpha[fill_mask[1:-1, 1:-1] > 0] = 0
    # slight feather to soften rounded edges
    alpha = cv2.GaussianBlur(alpha, (3, 3), 0)

    result = cv2.cvtColor(crop, cv2.COLOR_BGR2BGRA)
    result[:, :, 3] = alpha
    return result


def trim_transparent(bgra):
    """Crop away border rows/columns whose alpha is zero everywhere.

    Returns the tightest slice of ``bgra`` that still contains every pixel
    with non-zero alpha; if there is no such pixel the input is returned
    unchanged.
    """
    visible = bgra[:, :, 3] > 0
    rows = np.flatnonzero(visible.any(axis=1))
    if rows.size == 0:
        return bgra
    cols = np.flatnonzero(visible.any(axis=0))
    top, bottom = rows[0], rows[-1] + 1
    left, right = cols[0], cols[-1] + 1
    return bgra[top:bottom, left:right]


# --------------------------------------------------------------------------- #
# Single-file pipeline
# --------------------------------------------------------------------------- #
def process(path, args):
    """Extract the foreground element of one screenshot and save it.

    The element's bounding box comes from, in order of precedence,
    ``args.bbox``, a flood fill grown from ``args.seed``, or automatic
    detection via ``detect_element``. The box is enlarged by ``args.margin``,
    clamped to the image, made transparent outside the element, trimmed of
    fully transparent borders and written to ``args.outdir`` under the input's
    base name.

    Args:
        path: path of the input image.
        args: parsed command-line namespace (reads ``bbox``, ``seed``,
            ``tolerance``, ``debug``, ``type``, ``min_score``, ``margin``,
            ``shrink`` and ``outdir``).

    Returns:
        ``True`` when the output file was written, ``False`` on any failure
        (unreadable input, seed outside the image, uncertain detection, empty
        crop region, or write error). Failures are reported on stderr.
    """
    img = cv2.imread(path, cv2.IMREAD_COLOR)
    if img is None:
        print(f"[error] cannot read {path}", file=sys.stderr)
        return False
    H, W = img.shape[:2]
    name = os.path.basename(path)
    contour = None  # exact element shape, if available from detection

    # --- determine the bounding box (and optional contour) of the element ---
    if args.bbox:
        x, y, w, h = args.bbox
    elif args.seed:
        sx, sy = args.seed
        # floodFill aborts with an OpenCV assertion on an out-of-image seed;
        # report it as a per-file error instead of crashing the whole batch
        if not (0 <= sx < W and 0 <= sy < H):
            print(f"[error] {name}: seed {sx},{sy} is outside the {W}x{H} image",
                  file=sys.stderr)
            return False
        x, y, w, h = grow_from_seed(img, args.seed, args.tolerance)
    else:
        dbg_path = None
        if args.debug:
            stem = os.path.splitext(name)[0]
            dbg_path = f"_debug_{stem}.png"
        det = detect_element(img, kind=args.type, debug_path=dbg_path)
        if det is None or det[4] < args.min_score:
            sc = "n/d" if det is None else f"{det[4]:.2f}"
            print(f"[warning] {os.path.basename(path)}: uncertain detection "
                  f"(score={sc}). Use --seed X,Y or --bbox X,Y,W,H.", file=sys.stderr)
            return False
        x, y, w, h, _, contour = det

    # margin + clamp to image edges
    m = args.margin
    x0 = max(0, x - m)
    y0 = max(0, y - m)
    x1 = min(W, x + w + m)
    y1 = min(H, y + h + m)
    if x1 <= x0 or y1 <= y0:
        # e.g. a --bbox lying outside the image or with non-positive size;
        # an empty crop would make the flood fill / color conversion fail
        print(f"[error] {name}: crop region is empty "
              f"(bbox {x},{y},{w},{h} on a {W}x{H} image)", file=sys.stderr)
        return False
    crop = img[y0:y1, x0:x1]

    # --- transparency ---
    if contour is not None:
        # alpha = exact element shape (filled contour): no background
        # enters the result, rounded corners become transparent.
        alpha_full = mask_from_contour(img.shape, contour, shrink=args.shrink)
        alpha = alpha_full[y0:y1, x0:x1]
        alpha = cv2.GaussianBlur(alpha, (3, 3), 0)
        bgra = cv2.cvtColor(crop, cv2.COLOR_BGR2BGRA)
        bgra[:, :, 3] = alpha
    else:
        # fallback (--seed/--bbox): cleans background connected to the 4 corners
        bgra = make_transparent(crop, args.tolerance)
    bgra = trim_transparent(bgra)

    os.makedirs(args.outdir, exist_ok=True)
    out_path = os.path.join(args.outdir, name)
    # NOTE(review): the output keeps the input's extension, which presumably
    # selects the encoder; formats without alpha support would lose the
    # transparency -- confirm whether a forced ".png" suffix is wanted.
    try:
        written = cv2.imwrite(out_path, bgra)
    except cv2.error as exc:
        print(f"[error] cannot write {out_path}: {exc}", file=sys.stderr)
        return False
    if not written:
        # imwrite signals failure through its return value
        print(f"[error] cannot write {out_path}", file=sys.stderr)
        return False
    print(f"[ok] {path} -> {out_path}  ({bgra.shape[1]}x{bgra.shape[0]})")
    return True


def grow_from_seed(img, seed, tolerance):
    """Flood-fill from ``seed`` and return the filled region's bounding box.

    The fill is 4-connected and fixed-range: pixels within ``tolerance`` per
    channel of the seed pixel's color are included. Only the mask is filled;
    ``img`` itself is not modified.

    Returns:
        ``(x, y, w, h)`` of the region, or ``(seed[0], seed[1], 1, 1)`` if
        nothing was filled.
    """
    rows, cols = img.shape[:2]
    # floodFill requires a mask 2 px larger than the image in each dimension
    fill_mask = np.zeros((rows + 2, cols + 2), np.uint8)
    flags = 4 | (255 << 8) | cv2.FLOODFILL_MASK_ONLY | cv2.FLOODFILL_FIXED_RANGE
    delta = (tolerance,) * 3
    cv2.floodFill(img.copy(), fill_mask, tuple(seed), 0, delta, delta, flags)

    filled = fill_mask[1:-1, 1:-1] > 0
    hit_cols = np.flatnonzero(filled.any(axis=0))
    if hit_cols.size == 0:
        return (seed[0], seed[1], 1, 1)
    hit_rows = np.flatnonzero(filled.any(axis=1))

    left, right = int(hit_cols[0]), int(hit_cols[-1])
    top, bottom = int(hit_rows[0]), int(hit_rows[-1])
    return (left, top, right - left + 1, bottom - top + 1)


# --------------------------------------------------------------------------- #
def parse_pair(s, n):
    """Parse a string of exactly ``n`` comma-separated integers.

    Intended as an argparse ``type=`` converter (e.g. ``"10,20"`` for a seed,
    ``"10,20,300,200"`` for a bounding box).

    Args:
        s: the raw option value.
        n: the required number of values.

    Returns:
        A list of ``n`` ints.

    Raises:
        argparse.ArgumentTypeError: if the number of items is not ``n`` or an
            item is not an integer, so argparse shows a readable message
            instead of a generic "invalid <lambda> value" error.
    """
    parts = s.split(",")
    if len(parts) != n:
        raise argparse.ArgumentTypeError(f"expected {n} comma-separated values")
    try:
        return [int(p) for p in parts]
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"expected {n} comma-separated integers, got {s!r}") from None


def _expand_inputs(patterns):
    """Turn the command-line INPUT arguments into a list of file paths.

    An argument naming an existing path is always taken literally, so file
    names that happen to contain glob metacharacters (e.g. ``shot[1].png``)
    are not lost. Otherwise the argument is expanded as a glob pattern; an
    unmatched pattern is reported on stderr, while an unmatched plain path is
    kept so that ``process()`` can report the read error.
    """
    paths = []
    for pattern in patterns:
        if os.path.exists(pattern):
            paths.append(pattern)
            continue
        matched = sorted(glob.glob(pattern))
        if matched:
            paths.extend(matched)
        elif any(ch in pattern for ch in "*?[]"):
            print(f"[warning] no files match {pattern!r}", file=sys.stderr)
        else:
            paths.append(pattern)  # literal path: let process() report the error
    return paths


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: argument list to parse; ``None`` lets argparse use ``sys.argv``.

    Returns:
        ``0`` when every input file was processed successfully, ``1``
        otherwise (including when there are no input files).
    """
    p = argparse.ArgumentParser(description="Extracts the foreground element from screenshots.")
    p.add_argument("inputs", nargs="+", help="input images")
    p.add_argument("--outdir", default="cropped", help="output directory (default: cropped)")
    p.add_argument("--type", choices=["auto", "menu", "dialog"], default="auto",
                   help="element type to extract (default: auto)")
    p.add_argument("--seed", type=lambda s: parse_pair(s, 2), help="internal point X,Y (manual fallback)")
    p.add_argument("--bbox", type=lambda s: parse_pair(s, 4), help="bounding box X,Y,W,H (manual fallback)")
    p.add_argument("--tolerance", type=int, default=24, help="color tolerance for floodFill (default: 24)")
    p.add_argument("--margin", type=int, default=3, help="extra margin around crop in px (default: 3)")
    p.add_argument("--shrink", type=int, default=2,
                   help="border erosion pixels to remove background bleed (default: 2)")
    p.add_argument("--min-score", type=float, default=0.04, dest="min_score",
                   help="minimum confidence threshold for auto detection (default: 0.04)")
    p.add_argument("--debug", action="store_true", help="save _debug_<name>.png with highlighted candidates")
    args = p.parse_args(argv)

    # expand glob patterns (e.g. "screen*"), useful if the shell does not expand them
    paths = _expand_inputs(args.inputs)
    if not paths:
        print("[error] no input files.", file=sys.stderr)
        return 1

    ok = sum(1 for path in paths if process(path, args))
    print(f"\nCompleted {ok}/{len(paths)} files.")
    return 0 if ok == len(paths) else 1


if __name__ == "__main__":
    # hand main()'s status (0 = all files processed, 1 otherwise) to the shell
    sys.exit(main())