snapsift/pick.py at main · CVERInc/snapsift · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/usr/bin/env python3
"""
snapsift / pick.py
===================

Given the groups.json from scan.py, decide *which* photo to keep in each
cluster and which ones to delete. Outputs plan.json and delete-uuids.txt.

Keeper heuristic (in order):
  1. Never delete a favorite. A favorited frame always survives, and if a
     cluster has several favorites every one of them is kept.
  2. Prefer Apple's own per-photo quality score (sharpness, framing,
     timing, low noise…) — the genuinely better frame, not just the biggest
     file. Scores are quantised before comparison so near-ties fall through
     to the format/size tiebreakers rather than splitting hairs on noise.
  3. Prefer original-format files (HEIC/HEIF > JPG > MP4/MOV > PNG > TIFF
     > GIF, see UTI_PRIORITY) — favors the iPhone-native capture over
     edited/shared/forwarded versions.
  4. Among same-UTI, keep the largest file size — proxy for "highest
     quality version" (more bits = less compression).
  5. If still tied, keep the earliest one (the original take).

We never need pixel access: Apple already computed the aesthetic scores and
we trust that "bigger same-format file" usually means "original / less
compressed". Anything not the keeper *and* not a favorite gets deleted.

Usage:
    python3 pick.py --input groups.json --output plan.json \\
                    --uuid-out delete-uuids.txt
"""

from __future__ import annotations
import argparse, json, sys
from pathlib import Path
from collections import Counter


# Format preference table, keyed by UTI: the larger the number, the more we
# want to keep a file of that type. The ordering approximates "original
# iPhone capture" beating "shared/converted version".
UTI_PRIORITY = {
    # Native still formats come first.
    "public.heic": 100,
    "public.heif": 100,
    "public.jpeg": 80,
    # Video containers sit below HEIC/HEIF and JPEG, above the rest.
    "public.mpeg-4": 70,
    "com.apple.quicktime-movie": 70,
    # Formats that usually indicate a screenshot, export, or re-share.
    "public.png": 60,
    "public.tiff": 50,
    "com.compuserve.gif": 20,
}


def rank(p: dict) -> tuple:
    """Return the comparison key for a photo; a larger key means "keep this one".

    The key is a tuple compared left to right:

    1. favorite flag (1 when ``p["favorite"]`` is truthy, otherwise 0);
    2. the ``"quality"`` score rounded to one decimal place, so tiny
       differences fall through to the later fields (a missing or falsy
       score counts as 0.0);
    3. the format priority from ``UTI_PRIORITY`` (0 for UTIs not listed);
    4. the file size;
    5. the negated ``"taken_at"`` value, so the earliest take wins a full tie
       (assumes ``taken_at`` is a numeric timestamp).

    Raises ``KeyError`` if ``"uti"``, ``"size"`` or ``"taken_at"`` is missing.
    """
    favorite_flag = int(bool(p.get("favorite")))
    quality_bucket = round(p.get("quality") or 0.0, 1)
    format_score = UTI_PRIORITY.get(p["uti"], 0)
    byte_size = p["size"]
    # Negate so that, under "bigger is better", an earlier timestamp ranks higher.
    earliness = -p["taken_at"]
    return (favorite_flag, quality_bucket, format_score, byte_size, earliness)


def keeper(group: list[dict]) -> dict:
    """Return the photo in ``group`` with the highest ``rank`` key.

    When several photos share the top key, the one appearing first in
    ``group`` is returned. An empty ``group`` raises ``ValueError``.
    """
    best_photo = max(group, key=rank)
    return best_photo


def main():
    """CLI entry point: choose a keeper per cluster and write the plan files.

    Reads the JSON file given by ``--input`` (its top-level ``"groups"`` list
    holds clusters with ``"photos"``, ``"size"`` and ``"span_sec"``), picks a
    keeper for each cluster with ``keeper``, and writes:

    * ``--output``: the plan as JSON (overall stats plus, per cluster, the
      keeper and the photos to delete);
    * ``--uuid-out``: one UUID per line for every photo marked for deletion
      (an empty file when nothing is to be deleted).

    A photo is deleted only if it is neither the keeper nor a favorite.
    The "kept" statistics count every surviving photo in the processed
    clusters, i.e. the keeper plus any additional favorites. A summary is
    printed to stdout.
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("--input",      type=Path, default=Path("groups.json"))
    ap.add_argument("--output",     type=Path, default=Path("plan.json"))
    ap.add_argument("--uuid-out",   type=Path, default=Path("delete-uuids.txt"),
                    help="Plain newline-delimited UUIDs to feed delete.applescript")
    ap.add_argument("--max-groups", type=int, default=None,
                    help="Only emit first N clusters (handy for cautious first-pass)")
    args = ap.parse_args()

    # A negative N would silently slice clusters off the *end* of the list.
    if args.max_groups is not None and args.max_groups < 0:
        ap.error("--max-groups must be >= 0")

    data = json.loads(args.input.read_text(encoding="utf-8"))

    plan_groups = []
    delete_uuids: list[str] = []
    kept_format = Counter()
    deleted_format = Counter()
    kept_count = 0
    kept_bytes = 0
    deleted_bytes = 0

    # Compare against None so that "--max-groups 0" means "no clusters"
    # rather than being treated as "option not given".
    groups = data["groups"]
    if args.max_groups is not None:
        groups = groups[: args.max_groups]

    for g in groups:
        photos = g["photos"]
        keep = keeper(photos)
        # Delete everything that is neither the keeper nor a favorite. Favorites
        # are sacred — a cluster of all-favorites deletes nothing.
        survivors = []
        deletes = []
        for p in photos:
            if p["uuid"] != keep["uuid"] and not p.get("favorite"):
                deletes.append(p)
            else:
                survivors.append(p)
        plan_groups.append({
            "size":     g["size"],
            "span_sec": g["span_sec"],
            "keep":     keep,
            "delete":   deletes,
        })
        # Count every survivor (keeper + extra favorites) so that
        # kept + deleted adds up to the number of photos processed.
        kept_count += len(survivors)
        for s in survivors:
            kept_format[s["uti"] or "(none)"] += 1
            kept_bytes += s["size"]
        for d in deletes:
            delete_uuids.append(d["uuid"])
            deleted_format[d["uti"] or "(none)"] += 1
            deleted_bytes += d["size"]

    plan = {
        "source":  str(args.input),
        "stats": {
            "groups":         len(plan_groups),
            "kept":           kept_count,
            "deleted":        len(delete_uuids),
            "kept_bytes":     kept_bytes,
            "deleted_bytes":  deleted_bytes,
            "kept_formats":   dict(kept_format),
            "deleted_formats": dict(deleted_format),
        },
        "groups": plan_groups,
    }
    # ensure_ascii=False can emit non-ASCII text, so pin the encoding instead
    # of relying on the platform default.
    args.output.write_text(json.dumps(plan, indent=2, ensure_ascii=False),
                           encoding="utf-8")
    # One UUID per line; with nothing to delete the file is empty rather than
    # holding a single blank line.
    args.uuid_out.write_text("".join(f"{u}\n" for u in delete_uuids),
                             encoding="utf-8")

    print(f"✅ {args.output} — {len(plan_groups):,} groups")
    print(f"✅ {args.uuid_out} — {len(delete_uuids):,} UUIDs to delete")
    print()
    print(f"Kept   : {kept_count:,} photos, {kept_bytes/1e9:.2f} GB")
    print(f"Delete : {len(delete_uuids):,} photos, {deleted_bytes/1e9:.2f} GB")
    print()
    print("Keep format mix:")
    for fmt, n in kept_format.most_common():
        print(f"  {fmt:30s} {n:>6,}")
    print("Delete format mix:")
    for fmt, n in deleted_format.most_common():
        print(f"  {fmt:30s} {n:>6,}")


if __name__ == "__main__":
    main()