-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathingest.py
More file actions
93 lines (73 loc) · 2.83 KB
/
ingest.py
File metadata and controls
93 lines (73 loc) · 2.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
"""
Run once to build the FAISS vector index from the .txt, .pdf, and .docx files in sample_docs/
Usage: python ingest.py
"""
import glob
import os
import docx2txt
from dotenv import load_dotenv
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from pypdf import PdfReader
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API key used by the embeddings
# client created in main() — confirm.
load_dotenv()
# Input documents and the saved index both live next to this script.
DOCS_DIR = os.path.join(os.path.dirname(__file__), "sample_docs")
INDEX_DIR = os.path.join(os.path.dirname(__file__), "faiss_index")
# Upper bound, in characters, that chunk_text() applies when merging paragraphs.
CHUNK_SIZE = 500
# NOTE(review): not referenced by any code visible in this file — confirm
# whether chunk overlap is meant to be applied somewhere.
CHUNK_OVERLAP = 50
# Lower-case file extensions (with leading dot) that load_and_split() ingests.
SUPPORTED_EXTS = (".txt", ".pdf", ".docx")
def read_file(path: str) -> str:
    """Extract plain text from a .txt, .pdf, or .docx file.

    Args:
        path: Path of the file to read. The extension, compared
            case-insensitively, selects the extractor.

    Returns:
        The extracted text, or "" when the extension is not one of the
        three handled here or the extractor produced nothing.
    """
    ext = os.path.splitext(path)[1].lower()
    if ext == ".txt":
        # "utf-8-sig" decodes ordinary UTF-8 unchanged but drops a leading
        # byte-order mark, so a BOM never leaks into the first chunk.
        with open(path, "r", encoding="utf-8-sig") as f:
            return f.read()
    if ext == ".pdf":
        reader = PdfReader(path)
        # `or ""` keeps the join safe when a page yields no text; pages are
        # separated by a blank line so they split as distinct paragraphs.
        return "\n\n".join((page.extract_text() or "") for page in reader.pages)
    if ext == ".docx":
        return docx2txt.process(path) or ""
    return ""
def chunk_text(text: str, source: str) -> list[Document]:
    """Pack blank-line-separated paragraphs into chunks of about CHUNK_SIZE chars.

    Paragraphs are stripped and empty ones dropped. Consecutive paragraphs
    are merged (joined by a blank line) while the merged text stays within
    CHUNK_SIZE; a paragraph that is longer than CHUNK_SIZE on its own is
    emitted unsplit as a single chunk.

    Args:
        text: Full text of one document.
        source: Label stored under the "source" metadata key of every chunk.

    Returns:
        One Document per chunk, in document order.
    """
    pieces = []
    buffer = ""
    for block in text.split("\n\n"):
        block = block.strip()
        if not block:
            continue
        if len(buffer) + len(block) + 2 > CHUNK_SIZE:
            # Does not fit: close the running chunk (if any) and start a new
            # one with this paragraph, even if it is oversized by itself.
            if buffer:
                pieces.append(buffer)
            buffer = block
            continue
        buffer = f"{buffer}\n\n{block}".strip()
    if buffer:
        pieces.append(buffer)
    return [
        Document(page_content=piece, metadata={"source": source})
        for piece in pieces
    ]
def load_and_split() -> list[Document]:
    """Read every supported file in DOCS_DIR and return all of its chunks.

    Entries are visited in sorted path order. Files whose extension is not
    in SUPPORTED_EXTS are ignored silently; files that yield only
    whitespace are reported and skipped. A per-file chunk count is printed
    for everything that is ingested.

    Returns:
        The chunks of all ingested files, concatenated in visit order.
    """
    collected = []
    candidates = sorted(glob.glob(os.path.join(DOCS_DIR, "*")))
    for file_path in candidates:
        _, suffix = os.path.splitext(file_path)
        if suffix.lower() in SUPPORTED_EXTS:
            name = os.path.basename(file_path)
            content = read_file(file_path)
            if content.strip():
                pieces = chunk_text(content, name)
                print(f" {name}: {len(pieces)} chunks")
                collected += pieces
            else:
                print(f" Skipped (no extractable text): {name}")
    return collected
def main():
    """Build the FAISS index from the sample documents and save it to disk.

    Loads and chunks every supported file, embeds the chunks with the
    "models/gemini-embedding-001" model, and writes the resulting index to
    INDEX_DIR, printing progress along the way.

    Raises:
        SystemExit: If no chunks were produced, because there is nothing
            to embed and no index can be built from an empty list.
    """
    print("Loading and splitting documents...")
    chunks = load_and_split()
    print(f" Split into {len(chunks)} chunks")
    if not chunks:
        # Stop with a readable message instead of letting the index build
        # fail deep inside the vector-store library on an empty input.
        raise SystemExit(
            f"No indexable text found in {DOCS_DIR}; nothing to embed."
        )
    print("Embedding and building FAISS index...")
    embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")
    vectorstore = FAISS.from_documents(chunks, embeddings)
    vectorstore.save_local(INDEX_DIR)
    print(f"Done! Index saved to {INDEX_DIR}/")
    print(f" Total chunks indexed: {len(chunks)}")


if __name__ == "__main__":
    main()