Digvijay-x1 · Digvijay-x1 · Dec 10, 2025 · Dec 10, 2025 · Dec 10, 2025 · Dec 10, 2025
diff --git a/API/app/views/search/index.html.erb b/API/app/views/search/index.html.erb
@@ -13,9 +13,9 @@
 
     <% @results.each do |result| %>
       <div style="margin-bottom: 20px; padding: 10px; border: 1px solid #ddd; border-radius: 5px;">
-        <h3><a href="#"><%= result['title'] %></a></h3>
-        <p style="color: green; font-size: 14px;">Doc ID: <%= result['id'] %></p>
-        <p>This is a snippet for the document. In Phase 4, we will fetch real snippets from Postgres.</p>
+        <h3><a href="<%= result['url'] %>"><%= result['title'] %></a></h3>
+        <p style="color: green; font-size: 14px;"><%= result['url'] %></p>
+        <p><%= result['snippet'] %></p>
       </div>
     <% end %>
   </div>

diff --git a/cpp/indexer/src/main.cpp b/cpp/indexer/src/main.cpp
@@ -128,9 +128,17 @@ int main() {
             std::string html_content = full_warc_record.substr(header_end + 4);
 
             GumboOutput* output = gumbo_parse(html_content.c_str());
-            std::string plain_text = clean_text(output->root);
+            ExtractedContent content = extract_content(output->root);
+            std::string plain_text = content.text;
+            std::string title = content.title;
             gumbo_destroy_output(&kGumboDefaultOptions, output);
 
+            // Generate Snippet (first 200 chars)
+            std::string snippet = plain_text.substr(0, 200);
+            // Basic cleanup of snippet (remove newlines)
+            std::replace(snippet.begin(), snippet.end(), '\n', ' ');
+            std::replace(snippet.begin(), snippet.end(), '\r', ' ');
+
             // E. Tokenize & Index
             std::vector<std::string> tokens = tokenize(plain_text);
             std::set<std::string> unique_tokens(tokens.begin(), tokens.end()); // Simple boolean index for now
@@ -160,9 +168,10 @@ int main() {
                 }
             }
 
-            // F. Update Doc Length
+            // F. Update Doc Length, Title, and Snippet
             pqxx::work W2(*C);
-            W2.exec_params("UPDATE documents SET doc_length = $1 WHERE id = $2", tokens.size(), doc_id);
+            W2.exec_params("UPDATE documents SET doc_length = $1, title = $2, snippet = $3 WHERE id = $4", 
+                           tokens.size(), title, snippet, doc_id);
             W2.commit();
 
             std::cout << "Indexed " << tokens.size() << " words for Doc " << doc_id << std::endl;

diff --git a/cpp/indexer/src/utils.cpp b/cpp/indexer/src/utils.cpp
@@ -34,24 +34,47 @@ std::string build_db_conn_str() {
     }
 }
 
-std::string clean_text(GumboNode* node) {
+// Depth-first walk of the parse tree rooted at `node`, accumulating into
+// `content`:
+//   - text:  every GUMBO_NODE_TEXT node outside <script>/<style>; one space is
+//            appended after each child of an element except its last.
+//   - title: the leading text node of the first <title> element that has one.
+// Nodes that are neither text nor element are skipped.
+void extract_content_recursive(GumboNode* node, ExtractedContent& content) {
     if (node->type == GUMBO_NODE_TEXT) {
-        return std::string(node->v.text.text);
+        content.text.append(node->v.text.text);
     } else if (node->type == GUMBO_NODE_ELEMENT &&
                node->v.element.tag != GUMBO_TAG_SCRIPT &&
                node->v.element.tag != GUMBO_TAG_STYLE) {
-        std::string contents = "";
+
+        // Keep the first title found. A later <title> element (e.g. one in the
+        // body or inside inline SVG markup) must not replace the document title.
+        if (node->v.element.tag == GUMBO_TAG_TITLE && content.title.empty()) {
+            if (node->v.element.children.length > 0) {
+                GumboNode* title_text = static_cast<GumboNode*>(node->v.element.children.data[0]);
+                if (title_text->type == GUMBO_NODE_TEXT) {
+                    content.title = title_text->v.text.text;
+                }
+            }
+        }
+
+        // No early return: the title's own text is also appended to content.text.
         GumboVector* children = &node->v.element.children;
         for (unsigned int i = 0; i < children->length; ++i) {
-            const std::string text = clean_text(static_cast<GumboNode*>(children->data[i]));
-            if (i != 0 && !text.empty()) {
-                contents.append(" ");
+            extract_content_recursive(static_cast<GumboNode*>(children->data[i]), content);
+            if (i != children->length - 1) {
+                content.text.append(" ");
             }
-            contents.append(text);
         }
-        return contents;
     }
-    return "";
+}
+
+// Returns the text and title extracted from the tree rooted at `node`.
+// `node` must be non-null: it is dereferenced without a check.
+ExtractedContent extract_content(GumboNode* node) {
+    ExtractedContent content;
+    extract_content_recursive(node, content);
+    return content;
 }
 
 std::string decompress_gzip(const std::string& compressed_data) {

diff --git a/cpp/indexer/src/utils.hpp b/cpp/indexer/src/utils.hpp
@@ -14,7 +14,12 @@ std::string get_env_or_default(const char* var, const std::string& def);
 std::string build_db_conn_str();
 
 // Extract clean text from a Gumbo parse tree, ignoring script/style tags.
-std::string clean_text(GumboNode* node);
+// The result also carries the document title when a <title> element is found.
+struct ExtractedContent {
+    std::string text;   // text collected from the tree; <script>/<style> content is left out
+    std::string title;  // text taken from a <title> element; stays empty if none is found
+};
+ExtractedContent extract_content(GumboNode* node);
 
 // Decompress a gzip-compressed string.
 std::string decompress_gzip(const std::string& compressed_data);

diff --git a/cpp/indexer/tests/test_utils.cpp b/cpp/indexer/tests/test_utils.cpp
@@ -41,11 +41,11 @@ void test_tokenize_special_chars() {
     std::cout << "test_tokenize_special_chars passed" << std::endl;
 }
 
-// --- Test: clean_text ---
+// --- Test: extract_content ---
 void test_clean_text_simple() {
     const char* html = "<html><body><p>Hello World</p></body></html>";
     GumboOutput* output = gumbo_parse(html);
-    std::string text = indexer::clean_text(output->root);
+    std::string text = indexer::extract_content(output->root).text;
     gumbo_destroy_output(&kGumboDefaultOptions, output);
     // Text should contain "Hello World" (with possible surrounding whitespace)
     ASSERT(text.find("Hello World") != std::string::npos, "Should extract 'Hello World'");
@@ -55,7 +55,7 @@ void test_clean_text_simple() {
 void test_clean_text_ignores_script() {
     const char* html = "<html><body><script>alert('evil')</script><p>Clean</p></body></html>";
     GumboOutput* output = gumbo_parse(html);
-    std::string text = indexer::clean_text(output->root);
+    std::string text = indexer::extract_content(output->root).text;
     gumbo_destroy_output(&kGumboDefaultOptions, output);
     ASSERT(text.find("alert") == std::string::npos, "Should not contain script content");
     ASSERT(text.find("Clean") != std::string::npos, "Should contain 'Clean'");
@@ -65,13 +65,23 @@ void test_clean_text_ignores_script() {
 void test_clean_text_ignores_style() {
     const char* html = "<html><head><style>body{color:red}</style></head><body><p>Styled</p></body></html>";
     GumboOutput* output = gumbo_parse(html);
-    std::string text = indexer::clean_text(output->root);
+    std::string text = indexer::extract_content(output->root).text;
     gumbo_destroy_output(&kGumboDefaultOptions, output);
     ASSERT(text.find("color") == std::string::npos, "Should not contain style content");
     ASSERT(text.find("Styled") != std::string::npos, "Should contain 'Styled'");
     std::cout << "test_clean_text_ignores_style passed" << std::endl;
 }
 
+void test_extract_title() {
+    // A <title> in <head> must land in .title while body text still reaches .text.
+    GumboOutput* parsed = gumbo_parse("<html><head><title>My Title</title></head><body><p>Content</p></body></html>");
+    const indexer::ExtractedContent extracted = indexer::extract_content(parsed->root);
+    gumbo_destroy_output(&kGumboDefaultOptions, parsed);
+    ASSERT(extracted.title == "My Title", "Should extract title");
+    ASSERT(extracted.text.find("Content") != std::string::npos, "Should extract content");
+    std::cout << "test_extract_title passed" << std::endl;
+}
+
 // --- Test: decompress_gzip ---
 // Helper to compress a string with gzip
 std::string compress_gzip(const std::string& data) {
@@ -131,6 +141,7 @@ int main() {
         test_clean_text_simple();
         test_clean_text_ignores_script();
         test_clean_text_ignores_style();
+        test_extract_title();
         test_decompress_gzip_basic();
         test_decompress_gzip_empty();
         std::cout << "All tests passed!" << std::endl;

diff --git a/python/ranker/Dockerfile b/python/ranker/Dockerfile
@@ -24,5 +24,9 @@ WORKDIR /app
 COPY requirements.txt .
 RUN pip3 install --no-cache-dir "Cython<3"
 RUN pip3 install --no-cache-dir -r requirements.txt
+
 COPY . .
+# Build the C++ extension
+RUN pip3 install .
+
 CMD ["python3", "app.py"]
diff --git a/python/ranker/engine.py b/python/ranker/engine.py
@@ -4,13 +4,13 @@
 import numpy as np
 from collections import defaultdict
 
-# Try to import rocksdict, fallback to mock if failed
+# Try to import our custom C++ extension
 try:
-    from rocksdict import Rdict, Options, AccessType
+    from rocksdb_client import RocksDBReader
     ROCKSDB_AVAILABLE = True
 except ImportError:
     ROCKSDB_AVAILABLE = False
-    print("WARNING: rocksdict not available. Using Mock Index.")
+    print("WARNING: rocksdb_client extension not available. Using Mock Index.")
 
 class Ranker:
     def __init__(self):
@@ -42,7 +42,7 @@ def __init__(self):
         if ROCKSDB_AVAILABLE:
             try:
                 # We only need read access
-                self.index_db = Rdict(rocksdb_path, options=Options(), access_type=AccessType.read_only())
+                self.index_db = RocksDBReader(rocksdb_path)
                 print(f"Opened RocksDB at {rocksdb_path}")
             except Exception as e:
                 print(f"Failed to open RocksDB: {e}")
@@ -205,29 +205,45 @@ def search(self, query, k=10):
 
         # Fetch Metadata for top results
         results = []
-        if self.db_conn:
+        if self.db_conn and sorted_docs:
             try:
+                top_doc_ids = [doc_id for doc_id, _ in sorted_docs]
                 with self.db_conn.cursor() as cur:
+                    # Fetch all metadata in one query
+                    if len(top_doc_ids) == 1:
+                        query = "SELECT id, url, title, snippet FROM documents WHERE id = %s"
+                        params = (top_doc_ids[0],)
+                    else:
+                        query = "SELECT id, url, title, snippet FROM documents WHERE id IN %s"
+                        params = (tuple(top_doc_ids),)
+
+                    cur.execute(query, params)
+                    rows = cur.fetchall()
+
+                    # Create a lookup map
+                    meta_map = {r[0]: {'url': r[1], 'title': r[2], 'snippet': r[3]} for r in rows}
+
                     for doc_id, score in sorted_docs:
-                        cur.execute("SELECT url FROM documents WHERE id = %s", (doc_id,))
-                        row = cur.fetchone()
-                        if row:
+                        if doc_id in meta_map:
+                            meta = meta_map[doc_id]
                             results.append({
                                 "id": doc_id,
-                                "url": row[0],
+                                "url": meta['url'],
                                 "score": score,
-                                "title": row[0] # Use URL as title for now
+                                "title": meta['title'] if meta['title'] else meta['url'], # Fallback to URL if title is missing
+                                "snippet": meta['snippet'] if meta['snippet'] else "No preview available."
                             })
             except Exception as e:
                 print(f"Error fetching metadata: {e}")
         else:
-            # Fallback if DB is down
+            # Fallback if DB is down or no results
             for doc_id, score in sorted_docs:
                 results.append({
                     "id": doc_id,
                     "url": f"http://mock-url.com/{doc_id}",
                     "score": score,
-                    "title": f"Mock Document {doc_id}"
+                    "title": f"Mock Document {doc_id}",
+                    "snippet": "This is a mock snippet because the DB is unavailable."
                 })
 
         return results

diff --git a/python/ranker/requirements.txt b/python/ranker/requirements.txt
@@ -1,5 +1,5 @@
 Cython<3.0
 flask
-rocksdict
+pybind11
 psycopg2-binary
 numpy
diff --git a/python/ranker/rocksdb_client.cpp b/python/ranker/rocksdb_client.cpp
@@ -0,0 +1,63 @@
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <rocksdb/db.h>
+#include <memory>
+#include <string>
+#include <stdexcept>
+
+namespace py = pybind11;
+
+// Read-only handle to a RocksDB database, exposed to Python by the module
+// definition at the bottom of this file.
+//
+// The handle lives in a std::unique_ptr, which makes the class move-only:
+// an implicit copy of a reader holding a raw owning pointer would end in the
+// same rocksdb::DB being deleted twice.
+class RocksDBReader {
+    std::unique_ptr<rocksdb::DB> db;  // null once close() has run
+public:
+    // Opens the database at `path` read-only.
+    // Throws std::runtime_error if RocksDB reports a failure.
+    explicit RocksDBReader(const std::string& path) {
+        rocksdb::Options options;
+        // Use default comparator (Bytewise)
+        rocksdb::DB* raw = nullptr;
+        rocksdb::Status status = rocksdb::DB::OpenForReadOnly(options, path, &raw);
+        // Take ownership before the status check so nothing leaks on failure.
+        db.reset(raw);
+        if (!status.ok()) {
+            throw std::runtime_error("Failed to open RocksDB: " + status.ToString());
+        }
+    }
+
+    // Looks up `key`. Returns the value as bytes, or None when the key is
+    // absent or the reader has been closed. Throws std::runtime_error on any
+    // other RocksDB error.
+    py::object get(const py::bytes& key) {
+        if (!db) return py::none();
+
+        std::string key_str = key;
+        std::string value;
+        rocksdb::Status status = db->Get(rocksdb::ReadOptions(), key_str, &value);
+
+        if (status.IsNotFound()) {
+            return py::none();
+        }
+        if (!status.ok()) {
+            throw std::runtime_error("Error reading key: " + status.ToString());
+        }
+        return py::bytes(value);
+    }
+
+    // Releases the database handle. Safe to call more than once.
+    void close() {
+        db.reset();
+    }
+};
+
+PYBIND11_MODULE(rocksdb_client, m) {
+    py::class_<RocksDBReader>(m, "RocksDBReader")
+        .def(py::init<const std::string&>())
+        .def("get", &RocksDBReader::get)
+        .def("close", &RocksDBReader::close);
+}
diff --git a/python/ranker/setup.py b/python/ranker/setup.py
@@ -0,0 +1,18 @@
+# Build script for the rocksdb_client pybind11 extension module.
+from setuptools import Extension, setup
+import pybind11
+
+# Single C++17 extension compiled from rocksdb_client.cpp, linked with -lrocksdb.
+rocksdb_client_ext = Extension(
+    name="rocksdb_client",
+    sources=["rocksdb_client.cpp"],
+    include_dirs=[pybind11.get_include()],
+    libraries=["rocksdb"],
+    language="c++",
+    extra_compile_args=["-std=c++17"],
+)
+
+setup(
+    name="rocksdb_client",
+    ext_modules=[rocksdb_client_ext],
+)