diff --git a/API/app/views/search/index.html.erb b/API/app/views/search/index.html.erb index 6ddd229..76ff8ef 100644 --- a/API/app/views/search/index.html.erb +++ b/API/app/views/search/index.html.erb @@ -13,9 +13,9 @@ <% @results.each do |result| %>
-

<%= result['title'] %>

-

Doc ID: <%= result['id'] %>

-

This is a snippet for the document. In Phase 4, we will fetch real snippets from Postgres.

+

<%= result['title'] %>

+

<%= result['url'] %>

+

<%= result['snippet'] %>

<% end %> diff --git a/cpp/indexer/src/main.cpp b/cpp/indexer/src/main.cpp index e8b02c8..faf9f34 100644 --- a/cpp/indexer/src/main.cpp +++ b/cpp/indexer/src/main.cpp @@ -128,9 +128,17 @@ int main() { std::string html_content = full_warc_record.substr(header_end + 4); GumboOutput* output = gumbo_parse(html_content.c_str()); - std::string plain_text = clean_text(output->root); + ExtractedContent content = extract_content(output->root); + std::string plain_text = content.text; + std::string title = content.title; gumbo_destroy_output(&kGumboDefaultOptions, output); + // Generate Snippet (first 200 chars) + std::string snippet = plain_text.substr(0, 200); + // Basic cleanup of snippet (remove newlines) + std::replace(snippet.begin(), snippet.end(), '\n', ' '); + std::replace(snippet.begin(), snippet.end(), '\r', ' '); + // E. Tokenize & Index std::vector tokens = tokenize(plain_text); std::set unique_tokens(tokens.begin(), tokens.end()); // Simple boolean index for now @@ -160,9 +168,10 @@ int main() { } } - // F. Update Doc Length + // F. Update Doc Length, Title, and Snippet pqxx::work W2(*C); - W2.exec_params("UPDATE documents SET doc_length = $1 WHERE id = $2", tokens.size(), doc_id); + W2.exec_params("UPDATE documents SET doc_length = $1, title = $2, snippet = $3 WHERE id = $4", + tokens.size(), title, snippet, doc_id); W2.commit(); std::cout << "Indexed " << tokens.size() << " words for Doc " << doc_id << std::endl; diff --git a/cpp/indexer/src/utils.cpp b/cpp/indexer/src/utils.cpp index 0aa2eb9..c0702ae 100644 --- a/cpp/indexer/src/utils.cpp +++ b/cpp/indexer/src/utils.cpp @@ -34,24 +34,36 @@ std::string build_db_conn_str() { } } -std::string clean_text(GumboNode* node) { +void extract_content_recursive(GumboNode* node, ExtractedContent& content) { if (node->type == GUMBO_NODE_TEXT) { - return std::string(node->v.text.text); + content.text.append(node->v.text.text); } else if (node->type == GUMBO_NODE_ELEMENT && node->v.element.tag != GUMBO_TAG_SCRIPT && node->v.element.tag != GUMBO_TAG_STYLE) { - std::string contents = ""; + + if (node->v.element.tag == GUMBO_TAG_TITLE) { + if (node->v.element.children.length > 0) { + GumboNode* title_text = static_cast(node->v.element.children.data[0]); + if (title_text->type == GUMBO_NODE_TEXT) { + content.title = title_text->v.text.text; + } + } + } + GumboVector* children = &node->v.element.children; for (unsigned int i = 0; i < children->length; ++i) { - const std::string text = clean_text(static_cast(children->data[i])); - if (i != 0 && !text.empty()) { - contents.append(" "); + extract_content_recursive(static_cast(children->data[i]), content); + if (i != children->length - 1) { + content.text.append(" "); } - contents.append(text); } - return contents; } - return ""; +} + +ExtractedContent extract_content(GumboNode* node) { + ExtractedContent content; + extract_content_recursive(node, content); + return content; } std::string decompress_gzip(const std::string& compressed_data) { diff --git a/cpp/indexer/src/utils.hpp b/cpp/indexer/src/utils.hpp index f09ac9f..a92e399 100644 --- a/cpp/indexer/src/utils.hpp +++ b/cpp/indexer/src/utils.hpp @@ -14,7 +14,12 @@ std::string get_env_or_default(const char* var, const std::string& def); std::string build_db_conn_str(); // Extract clean text from a Gumbo parse tree, ignoring script/style tags. -std::string clean_text(GumboNode* node); +// Also extracts the title if found. +struct ExtractedContent { + std::string text; + std::string title; +}; +ExtractedContent extract_content(GumboNode* node); // Decompress a gzip-compressed string. std::string decompress_gzip(const std::string& compressed_data); diff --git a/cpp/indexer/tests/test_utils.cpp b/cpp/indexer/tests/test_utils.cpp index f041efa..e3a3e8b 100644 --- a/cpp/indexer/tests/test_utils.cpp +++ b/cpp/indexer/tests/test_utils.cpp @@ -41,11 +41,11 @@ void test_tokenize_special_chars() { std::cout << "test_tokenize_special_chars passed" << std::endl; } -// --- Test: clean_text --- +// --- Test: extract_content --- void test_clean_text_simple() { const char* html = "

Hello World

"; GumboOutput* output = gumbo_parse(html); - std::string text = indexer::clean_text(output->root); + std::string text = indexer::extract_content(output->root).text; gumbo_destroy_output(&kGumboDefaultOptions, output); // Text should contain "Hello World" (with possible surrounding whitespace) ASSERT(text.find("Hello World") != std::string::npos, "Should extract 'Hello World'"); @@ -55,7 +55,7 @@ void test_clean_text_simple() { void test_clean_text_ignores_script() { const char* html = "

Clean

"; GumboOutput* output = gumbo_parse(html); - std::string text = indexer::clean_text(output->root); + std::string text = indexer::extract_content(output->root).text; gumbo_destroy_output(&kGumboDefaultOptions, output); ASSERT(text.find("alert") == std::string::npos, "Should not contain script content"); ASSERT(text.find("Clean") != std::string::npos, "Should contain 'Clean'"); @@ -65,13 +65,23 @@ void test_clean_text_ignores_script() { void test_clean_text_ignores_style() { const char* html = "

Styled

"; GumboOutput* output = gumbo_parse(html); - std::string text = indexer::clean_text(output->root); + std::string text = indexer::extract_content(output->root).text; gumbo_destroy_output(&kGumboDefaultOptions, output); ASSERT(text.find("color") == std::string::npos, "Should not contain style content"); ASSERT(text.find("Styled") != std::string::npos, "Should contain 'Styled'"); std::cout << "test_clean_text_ignores_style passed" << std::endl; } +void test_extract_title() { + const char* html = "My Title

Content

"; + GumboOutput* output = gumbo_parse(html); + indexer::ExtractedContent content = indexer::extract_content(output->root); + gumbo_destroy_output(&kGumboDefaultOptions, output); + ASSERT(content.title == "My Title", "Should extract title"); + ASSERT(content.text.find("Content") != std::string::npos, "Should extract content"); + std::cout << "test_extract_title passed" << std::endl; +} + // --- Test: decompress_gzip --- // Helper to compress a string with gzip std::string compress_gzip(const std::string& data) { @@ -131,6 +141,7 @@ int main() { test_clean_text_simple(); test_clean_text_ignores_script(); test_clean_text_ignores_style(); + test_extract_title(); test_decompress_gzip_basic(); test_decompress_gzip_empty(); std::cout << "All tests passed!" << std::endl; diff --git a/python/ranker/Dockerfile b/python/ranker/Dockerfile index 9b94ae3..03ef0ec 100644 --- a/python/ranker/Dockerfile +++ b/python/ranker/Dockerfile @@ -24,5 +24,9 @@ WORKDIR /app COPY requirements.txt . RUN pip3 install --no-cache-dir "Cython<3" RUN pip3 install --no-cache-dir -r requirements.txt + COPY . . +# Build the C++ extension +RUN pip3 install . + CMD ["python3", "app.py"] \ No newline at end of file diff --git a/python/ranker/engine.py b/python/ranker/engine.py index ba275f9..e288ca0 100644 --- a/python/ranker/engine.py +++ b/python/ranker/engine.py @@ -4,13 +4,13 @@ import numpy as np from collections import defaultdict -# Try to import rocksdict, fallback to mock if failed +# Try to import our custom C++ extension try: - from rocksdict import Rdict, Options, AccessType + from rocksdb_client import RocksDBReader ROCKSDB_AVAILABLE = True except ImportError: ROCKSDB_AVAILABLE = False - print("WARNING: rocksdict not available. Using Mock Index.") + print("WARNING: rocksdb_client extension not available. Using Mock Index.") class Ranker: def __init__(self): @@ -42,7 +42,7 @@ def __init__(self): if ROCKSDB_AVAILABLE: try: # We only need read access - self.index_db = Rdict(rocksdb_path, options=Options(), access_type=AccessType.read_only()) + self.index_db = RocksDBReader(rocksdb_path) print(f"Opened RocksDB at {rocksdb_path}") except Exception as e: print(f"Failed to open RocksDB: {e}") @@ -205,29 +205,45 @@ def search(self, query, k=10): # Fetch Metadata for top results results = [] - if self.db_conn: + if self.db_conn and sorted_docs: try: + top_doc_ids = [doc_id for doc_id, _ in sorted_docs] with self.db_conn.cursor() as cur: + # Fetch all metadata in one query + if len(top_doc_ids) == 1: + query = "SELECT id, url, title, snippet FROM documents WHERE id = %s" + params = (top_doc_ids[0],) + else: + query = "SELECT id, url, title, snippet FROM documents WHERE id IN %s" + params = (tuple(top_doc_ids),) + + cur.execute(query, params) + rows = cur.fetchall() + + # Create a lookup map + meta_map = {r[0]: {'url': r[1], 'title': r[2], 'snippet': r[3]} for r in rows} + for doc_id, score in sorted_docs: - cur.execute("SELECT url FROM documents WHERE id = %s", (doc_id,)) - row = cur.fetchone() - if row: + if doc_id in meta_map: + meta = meta_map[doc_id] results.append({ "id": doc_id, - "url": row[0], + "url": meta['url'], "score": score, - "title": row[0] # Use URL as title for now + "title": meta['title'] if meta['title'] else meta['url'], # Fallback to URL if title is missing + "snippet": meta['snippet'] if meta['snippet'] else "No preview available." }) except Exception as e: print(f"Error fetching metadata: {e}") else: - # Fallback if DB is down + # Fallback if DB is down or no results for doc_id, score in sorted_docs: results.append({ "id": doc_id, "url": f"http://mock-url.com/{doc_id}", "score": score, - "title": f"Mock Document {doc_id}" + "title": f"Mock Document {doc_id}", + "snippet": "This is a mock snippet because the DB is unavailable." }) return results diff --git a/python/ranker/requirements.txt b/python/ranker/requirements.txt index 77e6c3c..b8b9d73 100644 --- a/python/ranker/requirements.txt +++ b/python/ranker/requirements.txt @@ -1,5 +1,5 @@ Cython<3.0 flask -rocksdict +pybind11 psycopg2-binary numpy \ No newline at end of file diff --git a/python/ranker/rocksdb_client.cpp b/python/ranker/rocksdb_client.cpp new file mode 100644 index 0000000..cbf771e --- /dev/null +++ b/python/ranker/rocksdb_client.cpp @@ -0,0 +1,57 @@ +#include +#include +#include +#include +#include + +namespace py = pybind11; + +class RocksDBReader { + rocksdb::DB* db; + bool is_open; +public: + RocksDBReader(const std::string& path) : db(nullptr), is_open(false) { + rocksdb::Options options; + // Use default comparator (Bytewise) + rocksdb::Status status = rocksdb::DB::OpenForReadOnly(options, path, &db); + if (!status.ok()) { + throw std::runtime_error("Failed to open RocksDB: " + status.ToString()); + } + is_open = true; + } + + ~RocksDBReader() { + close(); + } + + py::object get(const py::bytes& key) { + if (!is_open) return py::none(); + + std::string key_str = key; + std::string value; + rocksdb::Status status = db->Get(rocksdb::ReadOptions(), key_str, &value); + + if (status.IsNotFound()) { + return py::none(); + } + if (!status.ok()) { + throw std::runtime_error("Error reading key: " + status.ToString()); + } + return py::bytes(value); + } + + void close() { + if (is_open && db) { + delete db; + db = nullptr; + is_open = false; + } + } +}; + +PYBIND11_MODULE(rocksdb_client, m) { + py::class_(m, "RocksDBReader") + .def(py::init()) + .def("get", &RocksDBReader::get) + .def("close", &RocksDBReader::close); +} diff --git a/python/ranker/setup.py b/python/ranker/setup.py new file mode 100644 index 0000000..3face60 --- /dev/null +++ b/python/ranker/setup.py @@ -0,0 +1,18 @@ +from setuptools import setup, Extension +import pybind11 + +ext_modules = [ + Extension( + "rocksdb_client", + ["rocksdb_client.cpp"], + include_dirs=[pybind11.get_include()], + libraries=["rocksdb"], + language="c++", + extra_compile_args=["-std=c++17"], + ), +] + +setup( + name="rocksdb_client", + ext_modules=ext_modules, +)