diff --git a/API/app/views/search/index.html.erb b/API/app/views/search/index.html.erb index 6ddd229..76ff8ef 100644 --- a/API/app/views/search/index.html.erb +++ b/API/app/views/search/index.html.erb @@ -13,9 +13,9 @@ <% @results.each do |result| %>
Doc ID: <%= result['id'] %>
-This is a snippet for the document. In Phase 4, we will fetch real snippets from Postgres.
+<%= result['url'] %>
+<%= result['snippet'] %>
Hello World
"; GumboOutput* output = gumbo_parse(html); - std::string text = indexer::clean_text(output->root); + std::string text = indexer::extract_content(output->root).text; gumbo_destroy_output(&kGumboDefaultOptions, output); // Text should contain "Hello World" (with possible surrounding whitespace) ASSERT(text.find("Hello World") != std::string::npos, "Should extract 'Hello World'"); @@ -55,7 +55,7 @@ void test_clean_text_simple() { void test_clean_text_ignores_script() { const char* html = "Clean
"; GumboOutput* output = gumbo_parse(html); - std::string text = indexer::clean_text(output->root); + std::string text = indexer::extract_content(output->root).text; gumbo_destroy_output(&kGumboDefaultOptions, output); ASSERT(text.find("alert") == std::string::npos, "Should not contain script content"); ASSERT(text.find("Clean") != std::string::npos, "Should contain 'Clean'"); @@ -65,13 +65,23 @@ void test_clean_text_ignores_script() { void test_clean_text_ignores_style() { const char* html = "Styled
"; GumboOutput* output = gumbo_parse(html); - std::string text = indexer::clean_text(output->root); + std::string text = indexer::extract_content(output->root).text; gumbo_destroy_output(&kGumboDefaultOptions, output); ASSERT(text.find("color") == std::string::npos, "Should not contain style content"); ASSERT(text.find("Styled") != std::string::npos, "Should contain 'Styled'"); std::cout << "test_clean_text_ignores_style passed" << std::endl; } +void test_extract_title() { + const char* html = "Content
"; + GumboOutput* output = gumbo_parse(html); + indexer::ExtractedContent content = indexer::extract_content(output->root); + gumbo_destroy_output(&kGumboDefaultOptions, output); + ASSERT(content.title == "My Title", "Should extract title"); + ASSERT(content.text.find("Content") != std::string::npos, "Should extract content"); + std::cout << "test_extract_title passed" << std::endl; +} + // --- Test: decompress_gzip --- // Helper to compress a string with gzip std::string compress_gzip(const std::string& data) { @@ -131,6 +141,7 @@ int main() { test_clean_text_simple(); test_clean_text_ignores_script(); test_clean_text_ignores_style(); + test_extract_title(); test_decompress_gzip_basic(); test_decompress_gzip_empty(); std::cout << "All tests passed!" << std::endl; diff --git a/python/ranker/Dockerfile b/python/ranker/Dockerfile index 9b94ae3..03ef0ec 100644 --- a/python/ranker/Dockerfile +++ b/python/ranker/Dockerfile @@ -24,5 +24,9 @@ WORKDIR /app COPY requirements.txt . RUN pip3 install --no-cache-dir "Cython<3" RUN pip3 install --no-cache-dir -r requirements.txt + COPY . . +# Build the C++ extension +RUN pip3 install . + CMD ["python3", "app.py"] \ No newline at end of file diff --git a/python/ranker/engine.py b/python/ranker/engine.py index ba275f9..e288ca0 100644 --- a/python/ranker/engine.py +++ b/python/ranker/engine.py @@ -4,13 +4,13 @@ import numpy as np from collections import defaultdict -# Try to import rocksdict, fallback to mock if failed +# Try to import our custom C++ extension try: - from rocksdict import Rdict, Options, AccessType + from rocksdb_client import RocksDBReader ROCKSDB_AVAILABLE = True except ImportError: ROCKSDB_AVAILABLE = False - print("WARNING: rocksdict not available. Using Mock Index.") + print("WARNING: rocksdb_client extension not available. Using Mock Index.") class Ranker: def __init__(self): @@ -42,7 +42,7 @@ def __init__(self): if ROCKSDB_AVAILABLE: try: # We only need read access - self.index_db = Rdict(rocksdb_path, options=Options(), access_type=AccessType.read_only()) + self.index_db = RocksDBReader(rocksdb_path) print(f"Opened RocksDB at {rocksdb_path}") except Exception as e: print(f"Failed to open RocksDB: {e}") @@ -205,29 +205,45 @@ def search(self, query, k=10): # Fetch Metadata for top results results = [] - if self.db_conn: + if self.db_conn and sorted_docs: try: + top_doc_ids = [doc_id for doc_id, _ in sorted_docs] with self.db_conn.cursor() as cur: + # Fetch all metadata in one query + if len(top_doc_ids) == 1: + query = "SELECT id, url, title, snippet FROM documents WHERE id = %s" + params = (top_doc_ids[0],) + else: + query = "SELECT id, url, title, snippet FROM documents WHERE id IN %s" + params = (tuple(top_doc_ids),) + + cur.execute(query, params) + rows = cur.fetchall() + + # Create a lookup map + meta_map = {r[0]: {'url': r[1], 'title': r[2], 'snippet': r[3]} for r in rows} + for doc_id, score in sorted_docs: - cur.execute("SELECT url FROM documents WHERE id = %s", (doc_id,)) - row = cur.fetchone() - if row: + if doc_id in meta_map: + meta = meta_map[doc_id] results.append({ "id": doc_id, - "url": row[0], + "url": meta['url'], "score": score, - "title": row[0] # Use URL as title for now + "title": meta['title'] if meta['title'] else meta['url'], # Fallback to URL if title is missing + "snippet": meta['snippet'] if meta['snippet'] else "No preview available." }) except Exception as e: print(f"Error fetching metadata: {e}") else: - # Fallback if DB is down + # Fallback if DB is down or no results for doc_id, score in sorted_docs: results.append({ "id": doc_id, "url": f"http://mock-url.com/{doc_id}", "score": score, - "title": f"Mock Document {doc_id}" + "title": f"Mock Document {doc_id}", + "snippet": "This is a mock snippet because the DB is unavailable." }) return results diff --git a/python/ranker/requirements.txt b/python/ranker/requirements.txt index 77e6c3c..b8b9d73 100644 --- a/python/ranker/requirements.txt +++ b/python/ranker/requirements.txt @@ -1,5 +1,5 @@ Cython<3.0 flask -rocksdict +pybind11 psycopg2-binary numpy \ No newline at end of file diff --git a/python/ranker/rocksdb_client.cpp b/python/ranker/rocksdb_client.cpp new file mode 100644 index 0000000..cbf771e --- /dev/null +++ b/python/ranker/rocksdb_client.cpp @@ -0,0 +1,57 @@ +#include