TransitionalSQL/generate_extraction_scripts.py at main · depicted-candela/TransitionalSQL · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import os
import re

def sanitize_filename(title):
    """Turn a bookmark title into a conservative file-name stem.

    Every character other than ASCII letters, digits, ``_``, ``.`` and ``-``
    is dropped (whitespace included -- it is removed, not converted), runs of
    underscores collapse to a single one, underscores at either end are
    trimmed, and the result is capped at 150 characters. If nothing is left,
    the placeholder ``"unnamed_section"`` is returned.
    """
    length_cap = 150  # keeps names comfortably below common filesystem limits

    allowed_only = re.sub(r'[^a-zA-Z0-9_.-]', '', title)
    single_underscores = re.sub(r'_{2,}', '_', allowed_only)
    stem = single_underscores.strip('_')[:length_cap]

    # An all-punctuation title sanitizes to "", which is not a usable name.
    return stem or "unnamed_section"

def parse_metadata(metadata_path):
    """Read bookmark entries and the page count from a metadata text file.

    The file is scanned line by line for these keys (the layout looks like
    pdftk ``dump_data`` output -- confirm against the tool that produced it):

    * ``NumberOfPages:``      -> total page count
    * ``BookmarkBegin``       -> starts a new bookmark record
    * ``BookmarkTitle:``      -> ``title`` (str)
    * ``BookmarkLevel:``      -> ``level`` (int)
    * ``BookmarkPageNumber:`` -> ``page`` (int)

    Parsing stops at a line that starts with ``InfoBegin`` and contains
    ``ModDate``. A bookmark is kept only when all three fields are present,
    so callers can index ``title``/``level``/``page`` without checks, and
    each bookmark is emitted exactly once.

    Args:
        metadata_path: Path of the metadata text file.

    Returns:
        ``(bookmarks, num_pages)`` where ``bookmarks`` is a list of dicts with
        keys ``title``, ``level`` and ``page`` in file order, and
        ``num_pages`` is 0 when no ``NumberOfPages:`` line was seen.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a numeric field does not hold an integer.
    """
    required_keys = ('title', 'level', 'page')
    bookmarks = []
    num_pages = 0
    current_bookmark = {}

    def flush_pending():
        # Keep the pending record only if it is complete; partial records
        # would raise KeyError in code that reads the three fields.
        if all(key in current_bookmark for key in required_keys):
            bookmarks.append(current_bookmark)

    # Explicit encoding makes the result independent of the platform locale;
    # errors='replace' keeps one undecodable byte from aborting the parse.
    with open(metadata_path, 'r', encoding='utf-8', errors='replace') as f:
        for raw_line in f:
            line = raw_line.strip()

            # Stop marker. The pending bookmark is flushed once, after the
            # loop, so it is neither duplicated nor appended incomplete.
            if line.startswith('InfoBegin') and 'ModDate' in line:
                break

            if line.startswith('NumberOfPages:'):
                num_pages = int(line.split(':')[1].strip())
            elif line.startswith('BookmarkBegin'):
                flush_pending()
                current_bookmark = {}
            elif line.startswith('BookmarkTitle:'):
                # Slice off the prefix instead of splitting on it, so a title
                # that itself contains "BookmarkTitle:" is kept whole.
                current_bookmark['title'] = line[len('BookmarkTitle:'):].strip()
            elif line.startswith('BookmarkLevel:'):
                current_bookmark['level'] = int(line[len('BookmarkLevel:'):].strip())
            elif line.startswith('BookmarkPageNumber:'):
                current_bookmark['page'] = int(line[len('BookmarkPageNumber:'):].strip())

    flush_pending()
    return bookmarks, num_pages

def _escape_for_double_quotes(text):
    """Backslash-escape the characters bash interprets inside "..." strings."""
    # Inside double quotes only backslash, double quote, dollar and backtick
    # keep a special meaning; prefixing each with a backslash makes it literal.
    return re.sub(r'([\\"$`])', r'\\\1', text)


def generate_extraction_script(book_dir_name, pdf_file_name, bookmarks, total_pages):
    """Build the text of a bash script that splits a PDF by top-level bookmark.

    Each level-1 bookmark whose title is not one of the excluded front-matter
    titles becomes one ``pdftk ... cat START-END output ...`` command. A
    section ends on the page before the next selected bookmark starts; the
    last section ends at ``total_pages``. Output files go to an
    ``extracted_features`` directory, relative to where the script is run.

    Args:
        book_dir_name: Accepted for interface compatibility; not used here.
        pdf_file_name: PDF file name as the generated script should refer to it.
        bookmarks: Iterable of dicts with ``title`` (str), ``level`` (int)
            and ``page`` (int) keys.
        total_pages: Page count of the PDF; used as the end of the last section.

    Returns:
        The script as a single string with lines joined by ``"\\n"``.
    """
    # Front-matter / navigation sections that are not extracted as features.
    excluded_content_titles = {
        "Contents", "List of Examples", "List of Figures", "List of Tables", "Preface"
    }

    script_content = [
        "#!/bin/bash",
        "set -e",  # Exit immediately if a command exits with a non-zero status
        "",
        # The name comes from the filesystem, so escape it before embedding
        # it in a double-quoted shell string.
        f"PDF_FILE=\"{_escape_for_double_quotes(pdf_file_name)}\"",
        "OUTPUT_DIR=\"extracted_features\"",
        "mkdir -p \"$OUTPUT_DIR\"",
        "",
        "echo \"Starting extraction for $PDF_FILE...\"",
        "",
    ]

    # Only level-1 bookmarks count as features. Page numbers below 1 are not
    # valid page references and would make pdftk fail (aborting the whole
    # script under `set -e`), so such bookmarks are skipped.
    features_to_extract = sorted(
        (
            bm for bm in bookmarks
            if bm['level'] == 1
            and bm['title'] not in excluded_content_titles
            and bm['page'] >= 1
        ),
        key=lambda bm: bm['page'],
    )

    for i, feature in enumerate(features_to_extract):
        start_page = feature['page']
        if i + 1 < len(features_to_extract):
            # End page is one page before the next feature starts
            end_page = features_to_extract[i + 1]['page'] - 1
        else:
            # Last feature ends at the total number of pages in the PDF
            end_page = total_pages

        # Two bookmarks on the same page (or an unknown page count) would
        # give end < start; fall back to a single-page range.
        end_page = max(end_page, start_page)

        sanitized_title = sanitize_filename(feature['title'])
        # Add page numbers to avoid naming conflicts and provide context
        output_filename = f"{sanitized_title}_p{start_page}-{end_page}.pdf"

        # The title comes from the PDF's metadata; escape it so quotes,
        # `$` or backticks in it cannot break or inject into the script.
        display_title = _escape_for_double_quotes(feature['title'])
        script_content.append(
            f"echo \"Extracting '{display_title}' (pages {start_page}-{end_page})...\""
        )
        script_content.append(
            f"pdftk \"$PDF_FILE\" cat {start_page}-{end_page} output \"$OUTPUT_DIR/{output_filename}\""
        )
        script_content.append("")

    script_content.append("echo \"Extraction complete for $PDF_FILE.\"")
    return "\n".join(script_content)

# Define the base path where the 'additional' directory is located
# Adjust this path if your script is in a different location relative to 'additional'
base_path = os.path.join(os.path.dirname(__file__), "books", "additional")
root_dir = os.path.abspath(base_path)

print(f"Scanning directory: {root_dir}")

# Fail with a readable message instead of a traceback from os.listdir().
if not os.path.isdir(root_dir):
    raise SystemExit(f"Directory not found: {root_dir}")

# Sorted so books are processed in the same order on every run.
for item in sorted(os.listdir(root_dir)):
    book_dir_path = os.path.join(root_dir, item)
    if not os.path.isdir(book_dir_path):
        continue

    metadata_path = os.path.join(book_dir_path, "metadata.txt")
    if not os.path.exists(metadata_path):
        print(f"Skipping {item}: metadata.txt not found. Please ensure it's in the directory.")
        continue

    # Find the PDF file in the directory. Sorting makes the choice stable
    # when several PDFs are present; the extension match ignores case so
    # names such as "BOOK.PDF" are found too.
    pdf_name = next(
        (
            entry for entry in sorted(os.listdir(book_dir_path))
            if entry.lower().endswith(".pdf")
        ),
        "",
    )
    if not pdf_name:
        print(f"Skipping {item}: No PDF file found in the directory.")
        continue

    # We pass the PDF filename relative to the book_dir_path for the shell script
    # as the shell script will be executed from within that directory.
    print(f"Processing: {item} (PDF: {pdf_name})")
    bookmarks, total_pages = parse_metadata(metadata_path)
    extraction_script_content = generate_extraction_script(item, pdf_name, bookmarks, total_pages)

    script_filename = os.path.join(book_dir_path, f"extract_{item}.sh")
    # newline='\n' stops Windows from writing CRLF, which would corrupt the
    # shebang line; the explicit encoding avoids locale-dependent output.
    with open(script_filename, 'w', encoding='utf-8', newline='\n') as script_file:
        script_file.write(extraction_script_content)
    os.chmod(script_filename, 0o755) # Make the script executable
    print(f"Generated extraction script: {script_filename}")
    print("--------------------------------------------------")

print("\nAll extraction scripts generated.")
print("To extract features for a specific book, navigate to its directory and run its 'extract_*.sh' script.")
print("Example: cd books/additional/database-reference && ./extract_database-reference.sh")
print("This will create a new 'extracted_features' subdirectory containing the PDF features.")