Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions API/app/views/search/index.html.erb
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@

<% @results.each do |result| %>
<div style="margin-bottom: 20px; padding: 10px; border: 1px solid #ddd; border-radius: 5px;">
<h3><a href="#"><%= result['title'] %></a></h3>
<p style="color: green; font-size: 14px;">Doc ID: <%= result['id'] %></p>
<p>This is a snippet for the document. In Phase 4, we will fetch real snippets from Postgres.</p>
<h3><a href="<%= result['url'] %>"><%= result['title'] %></a></h3>
<p style="color: green; font-size: 14px;"><%= result['url'] %></p>
<p><%= result['snippet'] %></p>
</div>
<% end %>
</div>
Expand Down
15 changes: 12 additions & 3 deletions cpp/indexer/src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,9 +128,17 @@ int main() {
std::string html_content = full_warc_record.substr(header_end + 4);

GumboOutput* output = gumbo_parse(html_content.c_str());
std::string plain_text = clean_text(output->root);
ExtractedContent content = extract_content(output->root);
std::string plain_text = content.text;
std::string title = content.title;
gumbo_destroy_output(&kGumboDefaultOptions, output);

// Generate Snippet (first 200 chars)
std::string snippet = plain_text.substr(0, 200);
// Basic cleanup of snippet (remove newlines)
std::replace(snippet.begin(), snippet.end(), '\n', ' ');
std::replace(snippet.begin(), snippet.end(), '\r', ' ');

// E. Tokenize & Index
std::vector<std::string> tokens = tokenize(plain_text);
std::set<std::string> unique_tokens(tokens.begin(), tokens.end()); // Simple boolean index for now
Expand Down Expand Up @@ -160,9 +168,10 @@ int main() {
}
}

// F. Update Doc Length
// F. Update Doc Length, Title, and Snippet
pqxx::work W2(*C);
W2.exec_params("UPDATE documents SET doc_length = $1 WHERE id = $2", tokens.size(), doc_id);
W2.exec_params("UPDATE documents SET doc_length = $1, title = $2, snippet = $3 WHERE id = $4",
tokens.size(), title, snippet, doc_id);
W2.commit();

std::cout << "Indexed " << tokens.size() << " words for Doc " << doc_id << std::endl;
Expand Down
30 changes: 21 additions & 9 deletions cpp/indexer/src/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,24 +34,36 @@ std::string build_db_conn_str() {
}
}

std::string clean_text(GumboNode* node) {
void extract_content_recursive(GumboNode* node, ExtractedContent& content) {
if (node->type == GUMBO_NODE_TEXT) {
return std::string(node->v.text.text);
content.text.append(node->v.text.text);
} else if (node->type == GUMBO_NODE_ELEMENT &&
node->v.element.tag != GUMBO_TAG_SCRIPT &&
node->v.element.tag != GUMBO_TAG_STYLE) {
std::string contents = "";

if (node->v.element.tag == GUMBO_TAG_TITLE) {
if (node->v.element.children.length > 0) {
GumboNode* title_text = static_cast<GumboNode*>(node->v.element.children.data[0]);
if (title_text->type == GUMBO_NODE_TEXT) {
content.title = title_text->v.text.text;
}
}
}

GumboVector* children = &node->v.element.children;
for (unsigned int i = 0; i < children->length; ++i) {
const std::string text = clean_text(static_cast<GumboNode*>(children->data[i]));
if (i != 0 && !text.empty()) {
contents.append(" ");
extract_content_recursive(static_cast<GumboNode*>(children->data[i]), content);
if (i != children->length - 1) {
content.text.append(" ");
}
contents.append(text);
}
return contents;
}
return "";
}

// Entry point for content extraction: starts with an empty ExtractedContent,
// lets extract_content_recursive fill it while walking the tree rooted at
// `node`, and returns the result by value.
ExtractedContent extract_content(GumboNode* node) {
    ExtractedContent result;
    extract_content_recursive(node, result);
    return result;
}

std::string decompress_gzip(const std::string& compressed_data) {
Expand Down
7 changes: 6 additions & 1 deletion cpp/indexer/src/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@ std::string get_env_or_default(const char* var, const std::string& def);
std::string build_db_conn_str();

// Extract clean text from a Gumbo parse tree, ignoring script/style tags.
std::string clean_text(GumboNode* node);
// Also extracts the title if found.
// Result of walking a Gumbo parse tree with extract_content().
struct ExtractedContent {
// Text gathered from the tree's text nodes; <script> and <style> subtrees are skipped.
std::string text;
// Text of the <title> element when its first child is a text node; left empty otherwise.
std::string title;
};
ExtractedContent extract_content(GumboNode* node);

// Decompress a gzip-compressed string.
std::string decompress_gzip(const std::string& compressed_data);
Expand Down
19 changes: 15 additions & 4 deletions cpp/indexer/tests/test_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,11 @@ void test_tokenize_special_chars() {
std::cout << "test_tokenize_special_chars passed" << std::endl;
}

// --- Test: clean_text ---
// --- Test: extract_content ---
void test_clean_text_simple() {
const char* html = "<html><body><p>Hello World</p></body></html>";
GumboOutput* output = gumbo_parse(html);
std::string text = indexer::clean_text(output->root);
std::string text = indexer::extract_content(output->root).text;
gumbo_destroy_output(&kGumboDefaultOptions, output);
// Text should contain "Hello World" (with possible surrounding whitespace)
ASSERT(text.find("Hello World") != std::string::npos, "Should extract 'Hello World'");
Expand All @@ -55,7 +55,7 @@ void test_clean_text_simple() {
void test_clean_text_ignores_script() {
const char* html = "<html><body><script>alert('evil')</script><p>Clean</p></body></html>";
GumboOutput* output = gumbo_parse(html);
std::string text = indexer::clean_text(output->root);
std::string text = indexer::extract_content(output->root).text;
gumbo_destroy_output(&kGumboDefaultOptions, output);
ASSERT(text.find("alert") == std::string::npos, "Should not contain script content");
ASSERT(text.find("Clean") != std::string::npos, "Should contain 'Clean'");
Expand All @@ -65,13 +65,23 @@ void test_clean_text_ignores_script() {
void test_clean_text_ignores_style() {
const char* html = "<html><head><style>body{color:red}</style></head><body><p>Styled</p></body></html>";
GumboOutput* output = gumbo_parse(html);
std::string text = indexer::clean_text(output->root);
std::string text = indexer::extract_content(output->root).text;
gumbo_destroy_output(&kGumboDefaultOptions, output);
ASSERT(text.find("color") == std::string::npos, "Should not contain style content");
ASSERT(text.find("Styled") != std::string::npos, "Should contain 'Styled'");
std::cout << "test_clean_text_ignores_style passed" << std::endl;
}

void test_extract_title() {
const char* html = "<html><head><title>My Title</title></head><body><p>Content</p></body></html>";
GumboOutput* output = gumbo_parse(html);
indexer::ExtractedContent content = indexer::extract_content(output->root);
gumbo_destroy_output(&kGumboDefaultOptions, output);
ASSERT(content.title == "My Title", "Should extract title");
ASSERT(content.text.find("Content") != std::string::npos, "Should extract content");
std::cout << "test_extract_title passed" << std::endl;
}

// --- Test: decompress_gzip ---
// Helper to compress a string with gzip
std::string compress_gzip(const std::string& data) {
Expand Down Expand Up @@ -131,6 +141,7 @@ int main() {
test_clean_text_simple();
test_clean_text_ignores_script();
test_clean_text_ignores_style();
test_extract_title();
test_decompress_gzip_basic();
test_decompress_gzip_empty();
std::cout << "All tests passed!" << std::endl;
Expand Down
4 changes: 4 additions & 0 deletions python/ranker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,9 @@ WORKDIR /app
COPY requirements.txt .
RUN pip3 install --no-cache-dir "Cython<3"
RUN pip3 install --no-cache-dir -r requirements.txt

COPY . .
# Build the C++ extension
RUN pip3 install .

CMD ["python3", "app.py"]
40 changes: 28 additions & 12 deletions python/ranker/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
import numpy as np
from collections import defaultdict

# Try to import rocksdict, fallback to mock if failed
# Try to import our custom C++ extension
try:
from rocksdict import Rdict, Options, AccessType
from rocksdb_client import RocksDBReader
ROCKSDB_AVAILABLE = True
except ImportError:
ROCKSDB_AVAILABLE = False
print("WARNING: rocksdict not available. Using Mock Index.")
print("WARNING: rocksdb_client extension not available. Using Mock Index.")

class Ranker:
def __init__(self):
Expand Down Expand Up @@ -42,7 +42,7 @@ def __init__(self):
if ROCKSDB_AVAILABLE:
try:
# We only need read access
self.index_db = Rdict(rocksdb_path, options=Options(), access_type=AccessType.read_only())
self.index_db = RocksDBReader(rocksdb_path)
print(f"Opened RocksDB at {rocksdb_path}")
except Exception as e:
print(f"Failed to open RocksDB: {e}")
Expand Down Expand Up @@ -205,29 +205,45 @@ def search(self, query, k=10):

# Fetch Metadata for top results
results = []
if self.db_conn:
if self.db_conn and sorted_docs:
try:
top_doc_ids = [doc_id for doc_id, _ in sorted_docs]
with self.db_conn.cursor() as cur:
# Fetch all metadata in one query
if len(top_doc_ids) == 1:
query = "SELECT id, url, title, snippet FROM documents WHERE id = %s"
params = (top_doc_ids[0],)
else:
query = "SELECT id, url, title, snippet FROM documents WHERE id IN %s"
params = (tuple(top_doc_ids),)

cur.execute(query, params)
rows = cur.fetchall()

# Create a lookup map
meta_map = {r[0]: {'url': r[1], 'title': r[2], 'snippet': r[3]} for r in rows}

for doc_id, score in sorted_docs:
cur.execute("SELECT url FROM documents WHERE id = %s", (doc_id,))
row = cur.fetchone()
if row:
if doc_id in meta_map:
meta = meta_map[doc_id]
results.append({
"id": doc_id,
"url": row[0],
"url": meta['url'],
"score": score,
"title": row[0] # Use URL as title for now
"title": meta['title'] if meta['title'] else meta['url'], # Fallback to URL if title is missing
"snippet": meta['snippet'] if meta['snippet'] else "No preview available."
})
except Exception as e:
print(f"Error fetching metadata: {e}")
else:
# Fallback if DB is down
# Fallback if DB is down or no results
for doc_id, score in sorted_docs:
results.append({
"id": doc_id,
"url": f"http://mock-url.com/{doc_id}",
"score": score,
"title": f"Mock Document {doc_id}"
"title": f"Mock Document {doc_id}",
"snippet": "This is a mock snippet because the DB is unavailable."
})

return results
Expand Down
2 changes: 1 addition & 1 deletion python/ranker/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Cython<3.0
flask
rocksdict
pybind11
psycopg2-binary
numpy
57 changes: 57 additions & 0 deletions python/ranker/rocksdb_client.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <rocksdb/db.h>
#include <string>
#include <stdexcept>

namespace py = pybind11;

/// Read-only accessor for a RocksDB database, bound to Python via pybind11.
///
/// Owns the raw `rocksdb::DB*` produced by `OpenForReadOnly` and deletes it in
/// `close()`, which the destructor also calls. Copying is disabled because two
/// copies would each delete the same pointer.
class RocksDBReader {
    rocksdb::DB* db;
    bool is_open;
public:
    /// Opens the database at `path` for read-only access with default options.
    /// @throws std::runtime_error if RocksDB reports a non-OK status.
    explicit RocksDBReader(const std::string& path) : db(nullptr), is_open(false) {
        rocksdb::Options options;
        // Use default comparator (Bytewise)
        rocksdb::Status status = rocksdb::DB::OpenForReadOnly(options, path, &db);
        if (!status.ok()) {
            throw std::runtime_error("Failed to open RocksDB: " + status.ToString());
        }
        is_open = true;
    }

    // This class deletes `db` itself, so an implicit copy would double-delete it.
    RocksDBReader(const RocksDBReader&) = delete;
    RocksDBReader& operator=(const RocksDBReader&) = delete;

    ~RocksDBReader() {
        close();
    }

    /// Looks up `key` in the database.
    /// @return the stored value as `bytes`, or `None` when the reader has been
    ///         closed or the key is not found.
    /// @throws std::runtime_error on any other non-OK read status.
    py::object get(const py::bytes& key) {
        if (!is_open) return py::none();

        std::string key_str = key;
        std::string value;
        rocksdb::Status status = db->Get(rocksdb::ReadOptions(), key_str, &value);

        if (status.IsNotFound()) {
            return py::none();
        }
        if (!status.ok()) {
            throw std::runtime_error("Error reading key: " + status.ToString());
        }
        return py::bytes(value);
    }

    /// Releases the database handle. Safe to call more than once: later calls
    /// find `is_open` false and do nothing.
    void close() {
        if (is_open && db) {
            delete db;
            db = nullptr;
            is_open = false;
        }
    }
};

// Defines the `rocksdb_client` Python module. It exposes RocksDBReader with a
// constructor taking the database path, plus its `get` and `close` methods.
PYBIND11_MODULE(rocksdb_client, m) {
    py::class_<RocksDBReader> reader_class(m, "RocksDBReader");
    reader_class.def(py::init<const std::string&>());
    reader_class.def("get", &RocksDBReader::get);
    reader_class.def("close", &RocksDBReader::close);
}
18 changes: 18 additions & 0 deletions python/ranker/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from setuptools import setup, Extension
import pybind11

# The single C++ extension in this package: rocksdb_client.cpp compiled as
# C++17 against the pybind11 headers and linked with the `rocksdb` library.
rocksdb_client_extension = Extension(
    name="rocksdb_client",
    sources=["rocksdb_client.cpp"],
    include_dirs=[pybind11.get_include()],
    libraries=["rocksdb"],
    language="c++",
    extra_compile_args=["-std=c++17"],
)

ext_modules = [rocksdb_client_extension]

setup(
    name="rocksdb_client",
    ext_modules=ext_modules,
)