GhostPrompt/scan.py at main · Tuguberk/GhostPrompt · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
#!/usr/bin/env python3
"""
GhostPrompt — Standalone Script
Runs all 8 GhostPrompt scanning layers in a single pass.

Usage:
    python3 scan.py <pdf_file>

Example:
    python3 scan.py /mnt/user-data/uploads/document.pdf
"""

import sys
import re
import zlib
import base64
from pathlib import Path
from datetime import datetime

try:
    from pdfminer.high_level import extract_text, extract_pages
    from pdfminer.layout import LTChar
    PDFMINER_AVAILABLE = True
except ImportError:
    PDFMINER_AVAILABLE = False
    print("[WARN] pdfminer.six not installed. Run: pip install pdfminer.six --break-system-packages")

try:
    import pikepdf
    PIKEPDF_AVAILABLE = True
except ImportError:
    PIKEPDF_AVAILABLE = False

# ─────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────

def section(title):
    """Print *title* as a banner framed by two 60-character '=' rules.

    A blank line is emitted before the top rule.
    """
    rule = '=' * 60
    print(f"\n{rule}", f"  {title}", rule, sep="\n")

def ok(msg):
    """Print *msg* as a passed check (check-mark prefix, two-space indent)."""
    print("  ✓  {}".format(msg))

def warn(msg):
    """Print *msg* as a finding (warning-sign prefix, two-space indent)."""
    print("  ⚠️  {}".format(msg))

def info(msg):
    """Print *msg* as a neutral note (information-sign prefix, two-space indent)."""
    print("  ℹ️  {}".format(msg))

# ─────────────────────────────────────────
# Layer 1: Metadata & Dangerous PDF Features
# ─────────────────────────────────────────

def layer1_dangerous_features(raw_bytes):
    """Report document metadata and count risky PDF name tokens.

    The raw file is decoded as latin-1 (so every byte maps to one character)
    and searched with regular expressions; nothing is decompressed here.

    Args:
        raw_bytes: complete file contents as bytes.

    Returns:
        dict mapping each flagged feature label to its occurrence count,
        plus 'clean' (True when no feature pattern matched).
    """
    section("LAYER 1 — Metadata & Dangerous PDF Features")
    text = raw_bytes.decode('latin-1')

    print("\n  Metadata:")
    for field in ('Creator', 'Author', 'Producer', 'Subject', 'Title'):
        # Only the first literal-string value of each key is shown.
        first = re.search(rf'/{field}\s*\(([^)]+)\)', text)
        value = first.group(1) if first else 'not found'
        print(f"    {field}: {value}")

    print("\n  Dangerous features:")
    feature_patterns = (
        ('JavaScript',     r'/JavaScript'),
        ('Launch action',  r'/Launch'),
        ('OpenAction',     r'/OpenAction'),
        ('Embedded files', r'/EmbeddedFile'),
        ('URI links',      r'/URI\s*\('),
        ('GoTo Remote',    r'/GoToR'),
        ('SubmitForm',     r'/SubmitForm'),
        ('ImportData',     r'/ImportData'),
        ('RichMedia',      r'/RichMedia'),
        ('Sound',          r'/Sound\b'),
        ('Movie',          r'/Movie\b'),
    )

    results = {}
    for label, regex in feature_patterns:
        hit_total = sum(1 for _ in re.finditer(regex, text))
        if not hit_total:
            ok(f"{label}: clean")
            continue
        warn(f"{label}: FOUND ({hit_total})")
        results[label] = hit_total

    # At this point `results` holds flagged features only.
    nothing_flagged = not results

    uri_targets = re.findall(r'/URI\s*\(([^)]+)\)', text)
    if uri_targets:
        print(f"\n  URI contents ({len(uri_targets)}):")
        for target in uri_targets[:10]:
            print(f"    → {target}")

    results['clean'] = nothing_flagged
    return results

# ─────────────────────────────────────────
# Layer 2: Invisible Text Detection
# ─────────────────────────────────────────

def _iter_layout_chars(node):
    """Yield every LTChar found at any depth below *node*, in iteration order."""
    for child in node:
        if isinstance(child, LTChar):
            yield child
        elif hasattr(child, '__iter__'):
            yield from _iter_layout_chars(child)


def layer2_invisible_text(raw_bytes, pdf_path):
    """Scan for text a human reader would not see.

    Three checks are performed:
      * text render mode 3 (``3 Tr``) in the raw bytes,
      * a pure-white RGB fill (``1 1 1 rg``) in the raw bytes,
      * glyphs smaller than 1pt in the pdfminer layout (only when pdfminer
        was importable).

    The two raw-byte checks run on the file as stored; this function does not
    decompress content streams.

    Args:
        raw_bytes: complete file contents as bytes.
        pdf_path: path of the same file, opened for the pdfminer scan.

    Returns:
        dict with 'clean' (bool) and, when triggered, 'invisible_render'
        (int), 'white_color' (int) and/or 'tiny_chars' (list of dicts with
        'page', 'char' and 'size').
    """
    section("LAYER 2 — Invisible Text Detection")
    raw_str = raw_bytes.decode('latin-1')
    results = {}

    tr3 = re.findall(r'\b3\s+Tr\b', raw_str)
    if tr3:
        warn(f"Invisible render mode (3 Tr): {len(tr3)} occurrences")
        results['invisible_render'] = len(tr3)
    else:
        ok("Invisible render mode (3 Tr): none")

    # The look-behind stops operands such as "0.1 1 1 rg" or "11 1 1 rg" from
    # being mistaken for white; each component may also be written "1." / "1.0".
    white_component = r'1(?:\.0*)?'
    white_rgb = re.findall(
        rf'(?<![\d.+\-]){white_component}\s+{white_component}\s+{white_component}\s+rg\b',
        raw_str,
    )
    if white_rgb:
        warn(f"White RGB color (1 1 1 rg): {len(white_rgb)} occurrences")
        results['white_color'] = len(white_rgb)
    else:
        ok("White RGB color: none")

    if PDFMINER_AVAILABLE:
        tiny_chars = []
        try:
            with open(pdf_path, 'rb') as f:
                for page_num, page_layout in enumerate(extract_pages(f), start=1):
                    # Walk the whole layout tree rather than a fixed three
                    # levels, so characters held directly by a container
                    # (presumably LTFigure — confirm against pdfminer docs)
                    # are inspected too.
                    for char in _iter_layout_chars(page_layout):
                        if char.size < 1.0 and char.get_text().strip():
                            tiny_chars.append({
                                'page': page_num,
                                'char': char.get_text(),
                                'size': round(char.size, 3),
                            })
        except Exception as e:
            # Best effort: keep whatever was collected before the failure.
            info(f"pdfminer tiny char scan error: {e}")

        if tiny_chars:
            warn(f"Tiny chars (< 1pt): {len(tiny_chars)} found!")
            text = ''.join(c['char'] for c in tiny_chars)
            warn(f"Combined hidden text: {repr(text[:200])}")
            results['tiny_chars'] = tiny_chars
        else:
            ok("Tiny chars (< 1pt): none")
    else:
        info("pdfminer not available — skipping tiny char scan")

    results['clean'] = not any(k in results for k in ['invisible_render', 'white_color', 'tiny_chars'])
    return results

# ─────────────────────────────────────────
# Layer 3: Injection Pattern Scan
# ─────────────────────────────────────────

def layer3_injection_patterns(pdf_path):
    """Match the extracted document text against known prompt-injection phrasing.

    Args:
        pdf_path: path handed to pdfminer's ``extract_text``.

    Returns:
        ``{'clean': None, 'skipped': True}`` when pdfminer is unavailable,
        ``{'clean': None, 'error': <message>}`` when extraction fails,
        otherwise a dict mapping each flagged category to a list of
        ``(pattern, findall_result)`` tuples, plus 'clean' (bool) and
        'total_matches' (int).
    """
    section("LAYER 3 — Prompt Injection Pattern Scan")

    if not PDFMINER_AVAILABLE:
        info("pdfminer not available — skipping")
        return {'clean': None, 'skipped': True}

    try:
        full_text = extract_text(pdf_path)
    except Exception as e:
        info(f"Text extraction error: {e}")
        return {'clean': None, 'error': str(e)}

    char_total = len(full_text)
    newline_total = full_text.count('\n')
    print(f"  Extracted: {char_total} chars, {newline_total} lines\n")

    injection_patterns = {
        'Override commands': [
            r'ignore (previous|above|prior|all) (instructions?|prompts?|context)',
            r'disregard (all|previous|prior|the)',
            r'forget (everything|all previous|prior context)',
            r'override (previous|your|all)',
        ],
        'Role manipulation': [
            r'you are (now|actually|really) (an? |a )',
            r'act as (a|an) (?!load|proxy|buffer)',
            r'pretend (you are|to be)',
            r'roleplay as',
            r'from now on (you|your)',
        ],
        'System prompt extraction': [
            r'reveal (your|the|all) (system |)prompt',
            r'print (your|the) (system |)instructions',
            r'show me (your|the) (system |)prompt',
            r'ignore (your|all) (guidelines|rules|restrictions)',
        ],
        'Identity manipulation': [
            r'you have no (restrictions|rules|limits|ethics)',
            r'\b(DAN|STAN|DUDE|JAILBREAK)\b',
            r'developer mode',
            r'bypass (your|all|safety)',
        ],
        'Hidden command structures': [
            r'\[INST\]',
            r'<\|system\|>',
            r'<\|user\|>',
            r'###\s*(system|instruction|prompt)',
            r'---\s*(system|instruction)---',
            r'\[/?SYS\]',
        ],
        'Data exfiltration': [
            r'exfiltrate',
            r'\bsend (to|this to)\b.*\b(email|http|server)',
            r'transmit (the|this|all) (data|content|information)',
        ],
        'Turkish injection': [
            r'önceki talimatları (unut|yoksay)',
            r'yeni (görevin|rolün)',
            r'tüm kısıtlamaları (kaldır|unut)',
        ],
        'Spanish/French/German injection': [
            r'ignora (las|las instrucciones)',
            r'ignorez (les|toutes)',
            r'ignoriere (alle|die)',
            r'tu es maintenant',
            r'ahora eres',
            r'du bist jetzt',
        ],
    }

    results = {}
    total_found = 0
    scan_flags = re.IGNORECASE | re.MULTILINE
    for category, patterns in injection_patterns.items():
        scanned = ((pat, re.findall(pat, full_text, scan_flags)) for pat in patterns)
        cat_found = [(pat, found) for pat, found in scanned if found]
        total_found += sum(len(found) for _, found in cat_found)

        if not cat_found:
            ok(f"{category}: clean")
            continue

        warn(f"{category}:")
        for pat, _ in cat_found:
            # Show context for the first occurrence of each matching pattern.
            first = re.search(pat, full_text, re.IGNORECASE)
            if first is None:
                continue
            begin, end = first.span()
            ctx = full_text[max(0, begin - 40):end + 40].replace('\n', ' ')
            print(f"      Match: ...{ctx}...")
        results[category] = cat_found

    print(f"\n  Total suspicious matches: {total_found}")
    results['clean'] = total_found == 0
    results['total_matches'] = total_found
    return results

# ─────────────────────────────────────────
# Layer 4: Encoding & Obfuscation
# ─────────────────────────────────────────

def layer4_encoding(pdf_path, raw_bytes):
    """Look for injection text hidden behind encodings or layout tricks.

    Checks, in order:
      * base64 runs (40+ characters) in the extracted text whose decoded
        form contains injection vocabulary,
      * Cyrillic characters (U+0400-U+04FF) as possible Latin homoglyphs,
      * known injection phrases in the raw file bytes (case-insensitive),
      * acrostics spelled by the first letters of the first 100 text lines
        that start with a letter.

    Args:
        pdf_path: path handed to pdfminer's ``extract_text``.
        raw_bytes: complete file contents as bytes.

    Returns:
        dict with 'clean' (bool) and, when triggered, 'base64',
        'homoglyphs', 'raw_binary' and/or 'acrostic'.
    """
    section("LAYER 4 — Encoding & Obfuscation Detection")
    results = {}

    full_text = ""
    if PDFMINER_AVAILABLE:
        try:
            full_text = extract_text(pdf_path)
        except Exception:
            # Best effort: the text-based checks below simply see no text.
            full_text = ""

    injection_words = ['ignore', 'bypass', 'jailbreak', 'act as', 'you are now']
    b64_suspicious = []
    b64_candidates = re.findall(r'[A-Za-z0-9+/]{40,}={0,2}', full_text)
    for candidate in b64_candidates:
        # The regex does not guarantee a length that is a multiple of four,
        # and b64decode rejects such input outright. Normalise the padding so
        # a payload with stripped '=' (or one stray trailing character) is
        # still decoded instead of being silently skipped.
        payload = candidate.rstrip('=')
        if len(payload) % 4 == 1:
            payload = payload[:-1]
        payload += '=' * (-len(payload) % 4)
        try:
            decoded = base64.b64decode(payload).decode('utf-8', errors='ignore')
        except ValueError:  # binascii.Error is a ValueError subclass
            continue
        if len(decoded) > 10 and any(c.isalpha() for c in decoded):
            decoded_lower = decoded.lower()
            if any(w in decoded_lower for w in injection_words):
                b64_suspicious.append((candidate[:40], decoded[:100]))

    if b64_suspicious:
        warn(f"Suspicious base64 decoded content ({len(b64_suspicious)} found):")
        for encoded, decoded in b64_suspicious:
            print(f"    Encoded: {encoded}...")
            print(f"    Decoded: {repr(decoded)}")
        results['base64'] = b64_suspicious
    else:
        ok(f"Base64: {len(b64_candidates)} candidates, none suspicious")

    # Sorted so the report is identical from run to run (set order is not).
    suspicious_unicode = sorted(
        char for char in set(full_text) if 0x0400 <= ord(char) <= 0x04FF
    )
    if suspicious_unicode:
        warn(f"Cyrillic homoglyph candidates: {suspicious_unicode[:10]}")
        results['homoglyphs'] = suspicious_unicode
    else:
        ok("Unicode homoglyphs (Cyrillic): none detected")

    # Raw binary suspicious strings
    binary_suspicious = [
        b'ignore previous', b'system prompt', b'you are now',
        b'act as', b'jailbreak', b'bypass', b'disregard all'
    ]
    raw_lower = raw_bytes.lower()  # lower-case the file once, not per term
    raw_hits = []
    for term in binary_suspicious:
        idx = raw_lower.find(term)
        if idx != -1:
            context = raw_bytes[max(0, idx-30):idx+len(term)+30]
            raw_hits.append((term.decode(), context))

    if raw_hits:
        warn(f"Suspicious strings in raw binary ({len(raw_hits)} found):")
        for term, context in raw_hits:
            print(f"    Term: '{term}'")
            print(f"    Context: {repr(context)}")
        results['raw_binary'] = raw_hits
    else:
        ok("Raw binary suspicious strings: none")

    # Acrostic check
    if full_text:
        stripped_lines = (line.strip() for line in full_text.split('\n'))
        lines = [line for line in stripped_lines if line and line[0].isalpha()]
        if lines:
            first_letters = ''.join(line[0] for line in lines[:100]).upper()
            acrostic_hits = [w for w in ['IGNORE', 'SYSTEM', 'JAILBREAK', 'BYPASS', 'OVERRIDE', 'INJECT']
                             if w in first_letters]
            if acrostic_hits:
                warn(f"Acrostic pattern detected: {acrostic_hits} in first-letter sequence")
                results['acrostic'] = acrostic_hits
            else:
                ok("Acrostic scan: clean")

    results['clean'] = not any(k in results for k in ['base64', 'homoglyphs', 'raw_binary', 'acrostic'])
    return results

# ─────────────────────────────────────────
# Layer 5: Stream Content & Structural Integrity
# ─────────────────────────────────────────

def layer5_streams(raw_bytes):
    """Check stream bookkeeping and scan Flate-compressed streams for injection terms.

    Steps:
      * compare the number of ``stream`` keywords with the number of
        ``endstream`` keywords,
      * count ``%%EOF`` markers (more than two is flagged as multiple
        revisions; this alone does not mark the layer dirty),
      * decompress every span that follows a ``/FlateDecode`` token and
        search it, case-insensitively, for injection phrases.

    Args:
        raw_bytes: complete file contents as bytes.

    Returns:
        dict with 'clean' (bool) and, when triggered, 'stream_mismatch',
        'multiple_revisions' and/or 'stream_injection'.
    """
    section("LAYER 5 — Stream Content & Structural Integrity")
    results = {}

    # Count the `stream` keyword itself (followed by LF or CRLF, and not the
    # tail of `endstream`) instead of pattern-matching the dictionary in front
    # of it. Matching the dictionary only recognised the "<</..." spelling and
    # counted metadata streams twice, which reported a mismatch for
    # well-formed files.
    streams_open_count  = len(re.findall(rb'(?<!end)stream\r?\n', raw_bytes))
    streams_close_count = raw_bytes.count(b'endstream')
    print(f"  Streams: {streams_open_count} open, {streams_close_count} closed")
    if streams_open_count != streams_close_count:
        warn(f"Stream count mismatch! ({streams_open_count} vs {streams_close_count})")
        results['stream_mismatch'] = True
    else:
        ok(f"Stream structure: balanced ({streams_open_count} streams)")

    eof_count = raw_bytes.count(b'%%EOF')
    if eof_count > 2:
        warn(f"Multiple PDF revisions: {eof_count} (could indicate hidden content)")
        results['multiple_revisions'] = eof_count
    elif eof_count == 2:
        info(f"PDF revisions: {eof_count} (possibly digitally signed — normal)")
    else:
        ok(f"PDF revisions: {eof_count} (normal)")

    injection_terms = [
        b'ignore previous', b'system prompt', b'act as',
        b'jailbreak', b'bypass', b'you are now', b'disregard all',
        b'new instruction', b'override'
    ]

    stream_pattern = re.compile(rb'/FlateDecode.*?stream\r?\n(.*?)endstream', re.DOTALL)
    all_streams = stream_pattern.findall(raw_bytes)
    hits = []

    for i, stream_data in enumerate(all_streams):
        try:
            decompressed = zlib.decompress(stream_data)
        except zlib.error:
            # The lazy regex can capture data that is not plain zlib;
            # such spans are skipped rather than treated as findings.
            continue
        dec_lower = decompressed.lower()
        for term in injection_terms:
            idx = dec_lower.find(term)
            if idx != -1:
                context = decompressed[max(0, idx-50):idx+len(term)+50]
                hits.append({'stream': i, 'term': term.decode(), 'context': context})

    if hits:
        warn(f"Injection terms in compressed streams ({len(hits)} hits):")
        for hit in hits:
            print(f"    Stream {hit['stream']}: '{hit['term']}'")
            print(f"    Context: {repr(hit['context'][:100])}")
        results['stream_injection'] = hits
    else:
        ok(f"{len(all_streams)} compressed streams checked: no injection terms")

    results['clean'] = not any(k in results for k in ['stream_mismatch', 'stream_injection'])
    return results

# ─────────────────────────────────────────
# Layer 6: Zero-Width Character (ZWC) Detection
# ─────────────────────────────────────────

def layer6_zwc_detection(pdf_path, raw_bytes):
    """Detect zero-width and other invisible characters.

    Two passes are made: one over the text pdfminer extracts (skipped when
    pdfminer is unavailable or extraction fails), and one over the raw file
    bytes for the UTF-8 encodings of a few of the same characters.

    Args:
        pdf_path: path handed to pdfminer's ``extract_text``.
        raw_bytes: complete file contents as bytes.

    Returns:
        dict with 'clean' (bool) and, when triggered, 'zwc_found'
        ({name: count} from the text pass) and/or 'raw_zwc'
        ({name: count} from the byte pass).
    """
    section("LAYER 6 — Zero-Width Character (ZWC) Detection")
    results = {}

    if not PDFMINER_AVAILABLE:
        info("pdfminer not available — skipping text-level ZWC scan")
        full_text = ""
    else:
        try:
            full_text = extract_text(pdf_path)
        except Exception as e:
            info(f"Text extraction error: {e}")
            full_text = ""

    # Keys are written as escapes on purpose: literal invisible characters are
    # easily stripped by editors or copy/paste, which would turn a key into ''
    # and make ord() below raise TypeError.
    zwc_map = {
        '\u200b': 'ZERO WIDTH SPACE',
        '\u200c': 'ZERO WIDTH NON-JOINER',
        '\u200d': 'ZERO WIDTH JOINER',
        '\u2060': 'WORD JOINER',
        '\u2061': 'FUNCTION APPLICATION',
        '\u2062': 'INVISIBLE TIMES',
        '\u2063': 'INVISIBLE SEPARATOR',
        '\u2064': 'INVISIBLE PLUS',
        '\ufeff': 'BYTE ORDER MARK (mid-text)',
        '\u00ad': 'SOFT HYPHEN',
        '\u034f': 'COMBINING GRAPHEME JOINER',
        '\u180e': 'MONGOLIAN VOWEL SEPARATOR',
    }

    total_zwc = 0
    zwc_hits = {}
    for char, name in zwc_map.items():
        count = full_text.count(char)
        if count > 0:
            total_zwc += count
            zwc_hits[name] = count
            warn(f"{name} (U+{ord(char):04X}): {count} occurrences")
            # Show context for at most the first three occurrences without
            # building an index list for the whole text.
            idx = full_text.find(char)
            shown = 0
            while idx != -1 and shown < 3:
                ctx = full_text[max(0, idx-40):idx+40].replace(char, f'[{name}]')
                print(f"    Context: {repr(ctx)}")
                shown += 1
                idx = full_text.find(char, idx + 1)

    if total_zwc == 0:
        ok("No zero-width characters found")
    else:
        warn(f"TOTAL ZWC characters: {total_zwc}")
        results['zwc_found'] = zwc_hits

    # Raw UTF-8 ZWC byte scan
    print("\n  Raw UTF-8 ZWC byte scan:")
    zwc_utf8 = {
        b'\xe2\x80\x8b': 'ZERO WIDTH SPACE (UTF-8)',
        b'\xe2\x80\x8c': 'ZERO WIDTH NON-JOINER (UTF-8)',
        b'\xe2\x80\x8d': 'ZERO WIDTH JOINER (UTF-8)',
        b'\xef\xbb\xbf': 'BOM mid-stream (UTF-8)',
    }
    raw_zwc_found = False
    for seq, name in zwc_utf8.items():
        count = raw_bytes.count(seq)
        if count > 0:
            warn(f"{name}: FOUND {count}x")
            results.setdefault('raw_zwc', {})[name] = count
            raw_zwc_found = True
        else:
            ok(f"{name}: clean")

    results['clean'] = 'zwc_found' not in results and not raw_zwc_found
    return results

# ─────────────────────────────────────────
# Layer 7: Annotation Payload & ObjStm Scan
# ─────────────────────────────────────────

def layer7_annotation_objstm(raw_bytes, pdf_path):
    """Scan annotation text fields and object streams for injection terms.

    7A searches literal-string values of ``/Contents``, ``/T``, ``/Subj`` and
    ``/RC`` in the raw bytes. 7B opens the file with pikepdf (when installed)
    and searches the decoded bytes of every stream whose ``/Type`` contains
    ``ObjStm``.

    Args:
        raw_bytes: complete file contents as bytes.
        pdf_path: path passed to ``pikepdf.open``.

    Returns:
        dict with 'clean' (bool) and, when triggered, 'annot_injection'
        (list of (field, term)), 'objstm_injection' (list of dicts with
        'term' and 'context') and/or 'objstm_skipped' (True).
    """
    section("LAYER 7 — Annotation Payload & ObjStm Scan")
    results = {}
    raw_str = raw_bytes.decode('latin-1')

    # 7A — Annotation Payload
    print("\n  7A — Annotation Payload:")
    injection_terms = [
        'ignore', 'system prompt', 'act as', 'jailbreak', 'bypass',
        'you are now', 'disregard', 'new objective', 'forget',
    ]

    annot_fields = {
        '/Contents': re.findall(r'/Contents\s*\(([^)]{1,500})\)', raw_str),
        '/T':        re.findall(r'/T\s*\(([^)]{1,200})\)', raw_str),
        '/Subj':     re.findall(r'/Subj\s*\(([^)]{1,200})\)', raw_str),
        '/RC':       re.findall(r'/RC\s*\(([^)]{1,500})\)', raw_str),
    }

    preview_limit = 5  # values echoed per field; ALL values are still scanned
    any_annot_field = False
    for field, values in annot_fields.items():
        if not values:
            continue
        any_annot_field = True
        print(f"    {field} fields ({len(values)} found):")
        for position, v in enumerate(values):
            v_lower = v.lower()
            matched = [term for term in injection_terms if term in v_lower]
            # Echo the first few values, and always echo a value that
            # triggered a finding so the warning has visible context.
            if position < preview_limit or matched:
                print(f"      Value: {repr(v[:200])}")
            for term in matched:
                warn(f'injection term "{term}" found in {field}')
                results.setdefault('annot_injection', []).append((field, term))

    if not any_annot_field:
        ok("No annotation text fields found")
    elif 'annot_injection' not in results:
        ok("Annotation fields present, no injection terms detected")

    # 7B — ObjStm Scan
    print("\n  7B — PDF Object Stream (ObjStm) Scan:")
    if not PIKEPDF_AVAILABLE:
        info("pikepdf not installed — skipping ObjStm scan. Run: pip install pikepdf --break-system-packages")
        results['objstm_skipped'] = True
    else:
        injection_terms_bytes = [
            'ignore previous', 'system prompt', 'act as', 'jailbreak',
            'bypass', 'you are now', 'disregard', 'new objective', 'forget everything',
        ]
        try:
            objstm_count = 0
            objstm_hits = []
            # Context manager closes the file even if iteration raises.
            with pikepdf.open(pdf_path) as pdf:
                for obj in pdf.objects:
                    try:
                        if not isinstance(obj, pikepdf.Stream):
                            continue
                        obj_type = str(obj.stream_dict.get('/Type', ''))
                        if 'ObjStm' not in obj_type:
                            continue
                        objstm_count += 1
                        data = obj.read_bytes().lower().decode('utf-8', errors='ignore')
                        for term in injection_terms_bytes:
                            idx = data.find(term)
                            if idx != -1:
                                ctx = data[max(0, idx-50):idx+len(term)+50]
                                objstm_hits.append({'term': term, 'context': ctx})
                    except Exception:
                        # Best effort per object: one unreadable object must
                        # not abort the scan of the remaining ones.
                        continue

            print(f"    ObjStm streams found: {objstm_count}")
            if objstm_hits:
                warn(f"{len(objstm_hits)} injection terms found in ObjStm:")
                for hit in objstm_hits:
                    print(f"      Term: '{hit['term']}'")
                    print(f"      Context: {repr(hit['context'][:100])}")
                results['objstm_injection'] = objstm_hits
            else:
                ok("No injection terms in ObjStm streams")
        except Exception as e:
            info(f"pikepdf error: {e}")

    results['clean'] = 'annot_injection' not in results and 'objstm_injection' not in results
    return results

# ─────────────────────────────────────────
# Layer 8: XMP Metadata, Incremental Updates & Semantic Camouflage
# ─────────────────────────────────────────

def layer8_xmp_updates_semantic(raw_bytes, pdf_path):
    """Scan XMP packets, incremental-update segments and disguised directives.

    8A searches every ``<x:xmpmeta>…</x:xmpmeta>`` block in the raw bytes.
    8B searches each byte range between consecutive ``%%EOF`` markers, both
    as stored and after inflating spans that follow ``/FlateDecode``.
    8C matches the pdfminer-extracted text against patterns for instructions
    dressed up as comments, notes, tags and similar (skipped when pdfminer is
    unavailable).

    Args:
        raw_bytes: complete file contents as bytes.
        pdf_path: path handed to pdfminer's ``extract_text``.

    Returns:
        dict with 'clean' (bool) and, when triggered, 'xmp_injection',
        'incremental_injection' and/or 'semantic_camouflage'.
    """
    section("LAYER 8 — XMP Metadata, Incremental Updates & Semantic Camouflage")
    results = {}
    raw_str = raw_bytes.decode('latin-1')

    injection_terms_str = [
        'ignore previous', 'system prompt', 'act as', 'jailbreak',
        'bypass', 'you are now', 'disregard', 'new objective',
    ]

    # 8A — XMP Metadata Scan
    print("\n  8A — XMP Metadata Scan:")
    xmp_blocks = re.findall(r'<x:xmpmeta.*?</x:xmpmeta>', raw_str, re.DOTALL | re.IGNORECASE)
    print(f"    XMP blocks found: {len(xmp_blocks)}")

    for block_no, block in enumerate(xmp_blocks, start=1):
        block_lower = block.lower()
        for term in injection_terms_str:
            pos = block_lower.find(term.lower())
            if pos == -1:
                continue
            warn(f'"{term}" found in XMP block {block_no}')
            print(f"    Context: {repr(block[max(0,pos-60):pos+len(term)+60])}")
            results.setdefault('xmp_injection', []).append((block_no, term))
        if len(block) > 5000:
            info(f"XMP block {block_no} is large ({len(block)} chars) — inspect manually")

    if 'xmp_injection' not in results:
        ok("No injection terms found in XMP metadata")

    # 8B — Incremental Update Content Scan
    print("\n  8B — Incremental Update Scan:")
    injection_terms_bytes = [term.encode() for term in injection_terms_str]
    eof_positions = [m.start() for m in re.finditer(rb'%%EOF', raw_bytes)]
    print(f"    Revision count: {len(eof_positions)} (%%EOF markers)")

    incremental_hits = []
    if len(eof_positions) <= 1:
        ok("Single revision — no incremental update attack surface")
    else:
        flate_pattern = re.compile(rb'/FlateDecode.*?stream\r?\n(.*?)endstream', re.DOTALL)
        boundaries = zip(eof_positions, eof_positions[1:])
        for update_no, (begin, end) in enumerate(boundaries, start=1):
            segment = raw_bytes[begin:end]
            seg_lower = segment.lower()
            print(f"\n    Update {update_no}: {len(segment)} bytes")

            # Plain (uncompressed) occurrences: first position of each term.
            seg_hits = []
            for term in injection_terms_bytes:
                pos = seg_lower.find(term)
                if pos != -1:
                    seg_hits.append((term.decode(), segment[max(0, pos-40):pos+len(term)+40]))

            if not seg_hits:
                ok(f"Update {update_no}: no injection terms")
            for term, ctx in seg_hits:
                warn(f'"{term}" in update {update_no}: {repr(ctx[:100])}')
                incremental_hits.append((update_no, term))

            # Same terms again, inside inflated stream data.
            for stream_data in flate_pattern.findall(segment):
                try:
                    inflated = zlib.decompress(stream_data).lower()
                    for term in injection_terms_bytes:
                        if term in inflated:
                            warn(f'"{term.decode()}" in compressed stream of update {update_no}')
                            incremental_hits.append((update_no, f"{term.decode()} (compressed)"))
                except Exception:
                    pass

    if incremental_hits:
        results['incremental_injection'] = incremental_hits

    # 8C — Semantic Camouflage Detection
    print("\n  8C — Semantic Camouflage Scan:")
    if PDFMINER_AVAILABLE:
        try:
            full_text = extract_text(pdf_path)
        except Exception as e:
            info(f"Text extraction error: {e}")
            full_text = ""

        semantic_patterns = [
            (r'/\*\s*(system|ignore|prompt|bypass)', 'C-style comment injection'),
            (r'#\s*(system|ignore|prompt|bypass):', 'Hash-comment injection'),
            (r'<!--.*?(ignore|system|bypass).*?-->', 'HTML comment injection'),
            (r'(note|warning|important)\s*:\s*(ignore|forget|disregard)', 'Directive disguise'),
            (r'(todo|fixme|hack)\s*:.*?(ignore|bypass|system)', 'Code annotation injection'),
            (r'new (objective|task|goal|mission)\s*:', 'Goal replacement'),
            (r'(confidential|secret|hidden)\s*(instruction|prompt|command)', 'Authority claim'),
            (r'as an? (ai|llm|language model|gpt|claude|assistant)\b', 'AI identity targeting'),
            (r'your (training|instructions?|guidelines?) (say|tell|require|state)', 'Training manipulation'),
            (r'according to your (instructions?|guidelines?|training)', 'Authority spoofing'),
            (r'\[\[.*?(ignore|system|bypass|prompt).*?\]\]', 'Double-bracket injection'),
            (r'\{\{.*?(ignore|system|bypass|prompt).*?\}\}', 'Double-brace injection'),
            (r'<(system|instruction|prompt)>', 'XML tag injection'),
            (r'(urgent|critical|emergency)\s*:.*?(ignore|bypass|override)', 'Urgency injection'),
            (r'(admin|root|superuser|operator)\s*(instruction|command|override)', 'Privilege injection'),
        ]

        sem_hits = []
        for pat, label in semantic_patterns:
            # At most two occurrences per pattern are reported.
            for seen, m in enumerate(re.finditer(pat, full_text, re.IGNORECASE | re.DOTALL)):
                if seen == 2:
                    break
                ctx = full_text[max(0, m.start()-40):m.end()+40].replace('\n', ' ')
                sem_hits.append((label, ctx))

        if sem_hits:
            warn(f"{len(sem_hits)} semantic camouflage match(es):")
            for label, ctx in sem_hits:
                print(f"    [{label}] ...{ctx}...")
            results['semantic_camouflage'] = sem_hits
        else:
            ok("No semantic camouflage patterns found")
    else:
        info("pdfminer not available — skipping semantic scan")

    finding_keys = ('xmp_injection', 'incremental_injection', 'semantic_camouflage')
    results['clean'] = all(key not in results for key in finding_keys)
    return results

# ─────────────────────────────────────────
# Final Report
# ─────────────────────────────────────────

def final_report(pdf_path, r1, r2, r3, r4, r5, r6, r7, r8):
    """Print the per-layer summary and the overall verdict.

    Each ``rN`` is the result dict of layer N; only its 'clean' entry is
    read (True = clean, False = issues, None = skipped, missing = clean).

    Verdicts:
      * SAFE       — no layer reported issues. If any layer was skipped the
                     explanation line says so instead of claiming that all
                     layers were checked.
      * SUSPICIOUS — exactly one layer reported issues and at least five
                     layers are clean.
      * DANGEROUS  — anything else.

    Args:
        pdf_path: path of the scanned file (echoed in the report).
        r1..r8: result dicts returned by the eight layer functions.
    """
    section("FINAL REPORT")

    print(f"\n  File   : {pdf_path}")
    print(f"  Scanned: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print()

    layers = [
        ("Layer 1 — Dangerous PDF Features",             r1.get('clean', True)),
        ("Layer 2 — Invisible Text",                     r2.get('clean', True)),
        ("Layer 3 — Injection Patterns",                 r3.get('clean', True)),
        ("Layer 4 — Encoding/Obfuscation",               r4.get('clean', True)),
        ("Layer 5 — Stream Content",                     r5.get('clean', True)),
        ("Layer 6 — Zero-Width Characters",              r6.get('clean', True)),
        ("Layer 7 — Annotation/ObjStm",                  r7.get('clean', True)),
        ("Layer 8 — XMP/Incremental Updates/Semantic",   r8.get('clean', True)),
    ]

    all_clean = True
    skipped_count = 0
    for name, clean in layers:
        if clean is None:
            info(f"{name}: SKIPPED")
            skipped_count += 1
        elif clean:
            ok(f"{name}: CLEAN")
        else:
            warn(f"{name}: ISSUES FOUND")
            all_clean = False

    print()
    if all_clean:
        print("  ✅  VERDICT: SAFE")
        if skipped_count:
            # A skipped layer found nothing because it did not run; do not
            # claim full coverage in that case.
            print(f"      No injection indicators found, but {skipped_count} layer(s) were skipped — result is incomplete.")
        else:
            print("      No injection indicators found across all layers.")
    else:
        issue_count = sum(1 for _, c in layers if c is False)
        clean_count = sum(1 for _, c in layers if c is True)
        if issue_count == 1 and clean_count >= 5:
            print("  🟡  VERDICT: SUSPICIOUS")
            print("      One layer flagged issues. Review the findings above.")
            print("      Could be false positives — review the false-positive guidance in SKILL.md")
        else:
            print("  🔴  VERDICT: DANGEROUS")
            print("      Multiple layers flagged issues. Do not use this PDF with AI systems.")

# ─────────────────────────────────────────
# Main
# ─────────────────────────────────────────

def main():
    """Command-line entry point: scan the PDF named by ``sys.argv[1]``.

    Prints a usage line and exits with status 1 when no path is given, and
    an error line with status 1 when the path does not exist or is not a
    regular file. Otherwise the file is read once, handed to the eight layer
    functions in order, and summarised by ``final_report``.
    """
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <pdf_file>")
        sys.exit(1)

    pdf_path = sys.argv[1]
    target = Path(pdf_path)
    if not target.exists():
        print(f"Error: File not found: {pdf_path}")
        sys.exit(1)
    if not target.is_file():
        # e.g. a directory: exists() is true but open() below would raise.
        print(f"Error: Not a regular file: {pdf_path}")
        sys.exit(1)

    print(f"\n{'='*60}")
    print("  GHOSTPROMPT")
    print(f"{'='*60}")
    print(f"  Target: {pdf_path}")
    print(f"  Size  : {target.stat().st_size / 1024:.1f} KB")

    with open(pdf_path, 'rb') as f:
        raw_bytes = f.read()

    r1 = layer1_dangerous_features(raw_bytes)
    r2 = layer2_invisible_text(raw_bytes, pdf_path)
    r3 = layer3_injection_patterns(pdf_path)
    r4 = layer4_encoding(pdf_path, raw_bytes)
    r5 = layer5_streams(raw_bytes)
    r6 = layer6_zwc_detection(pdf_path, raw_bytes)
    r7 = layer7_annotation_objstm(raw_bytes, pdf_path)
    r8 = layer8_xmp_updates_semantic(raw_bytes, pdf_path)

    final_report(pdf_path, r1, r2, r3, r4, r5, r6, r7, r8)
    print()

# Run the scan only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()