Cortex/ingest.py at main · Ranu92/Cortex · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
"""
Run once to build the FAISS vector index from sample_docs/*.txt
Usage: python ingest.py
"""

import glob
import os

import docx2txt
from dotenv import load_dotenv
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from pypdf import PdfReader

# Load variables from a .env file into the process environment before any
# client is constructed.
# NOTE(review): presumably this supplies the Google API key used by the
# embeddings client in main() — confirm.
load_dotenv()

# Input documents and the output index both live next to this script.
DOCS_DIR = os.path.join(os.path.dirname(__file__), "sample_docs")
INDEX_DIR = os.path.join(os.path.dirname(__file__), "faiss_index")
# Character budget used by chunk_text() when merging paragraphs into a chunk.
CHUNK_SIZE = 500
# NOTE(review): not referenced by any code visible in this file — chunks are
# currently produced without overlap.
CHUNK_OVERLAP = 50
# File extensions (lower-case, with leading dot) that load_and_split() ingests.
SUPPORTED_EXTS = (".txt", ".pdf", ".docx")


def read_file(path: str) -> str:
    """Extract plain text from a .txt, .pdf, or .docx file.

    The format is chosen from the file extension, compared case-insensitively.

    Args:
        path: Path of the file to read.

    Returns:
        The extracted text. For PDFs, the text of each page joined by a blank
        line (pages with no extractable text contribute an empty string). An
        empty string is returned for any unsupported extension.

    Raises:
        UnicodeDecodeError: If a .txt file is not valid UTF-8.
    """
    ext = os.path.splitext(path)[1].lower()
    if ext == ".txt":
        # "utf-8-sig" reads plain UTF-8 unchanged but also strips a leading
        # byte-order mark, which would otherwise end up as a stray "\ufeff"
        # at the start of the first chunk.
        with open(path, "r", encoding="utf-8-sig") as f:
            return f.read()
    if ext == ".pdf":
        reader = PdfReader(path)
        # extract_text() may yield nothing for a page; treat that as "".
        return "\n\n".join((page.extract_text() or "") for page in reader.pages)
    if ext == ".docx":
        return docx2txt.process(path) or ""
    return ""


def chunk_text(text: str, source: str) -> list[Document]:
    """Group the paragraphs of *text* into chunks of roughly CHUNK_SIZE chars.

    Paragraphs are the blank-line-separated ("\\n\\n") pieces of *text*, with
    surrounding whitespace removed and empty pieces dropped. Consecutive
    paragraphs are merged (re-joined with a blank line) while the running
    length stays within CHUNK_SIZE. A paragraph is never split, so a single
    paragraph longer than CHUNK_SIZE becomes one oversized chunk.

    Args:
        text: Full text of one document.
        source: Label stored under the "source" metadata key of every chunk.

    Returns:
        One Document per chunk, in document order.
    """
    pieces: list[str] = []
    buffer = ""

    for raw in text.split("\n\n"):
        para = raw.strip()
        if not para:
            continue

        # The +2 accounts for the "\n\n" separator that a merge would insert.
        if len(buffer) + len(para) + 2 > CHUNK_SIZE:
            # Paragraph does not fit: flush what we have and start afresh.
            if buffer:
                pieces.append(buffer)
            buffer = para
            continue

        # strip() removes the leading separator when the buffer was empty.
        buffer = f"{buffer}\n\n{para}".strip()

    if buffer:
        pieces.append(buffer)

    return [
        Document(page_content=piece, metadata={"source": source})
        for piece in pieces
    ]


def load_and_split() -> list[Document]:
    """Read every supported file in DOCS_DIR and return all of its chunks.

    Directory entries are visited in sorted path order. Entries whose
    extension is not in SUPPORTED_EXTS are ignored silently; files that yield
    only whitespace are reported and skipped. A per-file chunk count is
    printed for every file that is ingested.

    Returns:
        The chunks of all ingested files, concatenated in visiting order.
    """
    collected = []
    pattern = os.path.join(DOCS_DIR, "*")

    for path in sorted(glob.glob(pattern)):
        _, ext = os.path.splitext(path)
        if ext.lower() in SUPPORTED_EXTS:
            source = os.path.basename(path)
            text = read_file(path)
            if text.strip():
                docs = chunk_text(text, source)
                print(f"  {source}: {len(docs)} chunks")
                collected.extend(docs)
            else:
                print(f"  Skipped (no extractable text): {source}")

    return collected


def main():
    """Build the FAISS index from the documents in DOCS_DIR and save it.

    Loads and chunks the documents, embeds the chunks with the Google
    Generative AI embeddings model, builds a FAISS vector store from them and
    writes it to INDEX_DIR. Progress is printed to stdout.

    Raises:
        SystemExit: If no chunks were produced, so there is nothing to index.
    """
    print("Loading and splitting documents...")
    chunks = load_and_split()
    print(f"  Split into {len(chunks)} chunks")

    # Building a vector store from zero documents fails with an opaque error
    # inside the library (it sizes the index from the first embedding), so
    # stop early with an actionable message and a non-zero exit status.
    if not chunks:
        raise SystemExit(
            f"No indexable text found in {DOCS_DIR} "
            f"(supported extensions: {', '.join(SUPPORTED_EXTS)}); nothing to index."
        )

    print("Embedding and building FAISS index...")
    embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")
    vectorstore = FAISS.from_documents(chunks, embeddings)
    vectorstore.save_local(INDEX_DIR)

    print(f"Done! Index saved to {INDEX_DIR}/")
    print(f"  Total chunks indexed: {len(chunks)}")


# Build the index only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()