CoveritLabs · YoussefMehany · Apr 4, 2026 · Apr 4, 2026 · Apr 5, 2026 · Apr 6, 2026
diff --git a/.env.example b/.env.example
@@ -0,0 +1,32 @@
+# Neo4j
+NEO4J_URI=bolt://localhost:7687
+NEO4J_USER=neo4j
+NEO4J_PASSWORD=password
+
+# Crawler behavior
+HEADLESS=true
+TIMEOUT_MS=3000
+MAX_STATES=1000
+MAX_TRANSITIONS=5000
+MAX_ELEMENTS_PER_STATE=3
+USE_SEMANTIC_DIVERSITY=true
+SEMANTIC_DIVERSITY_THRESHOLD=0.90
+SEMANTIC_UNCERTAINTY_MARGIN=0.05
+SEMANTIC_MAX_BANK_SIZE=1000
+SEMANTIC_ARTIFACT_DIR=src/models/semantic
+MAX_SELECT_OPTIONS_PER_ELEMENT=3
+MAX_ACTION_REPEATS_PER_URL=2
+ACTION_RETRY_COUNT=1
+REPLAY_RETRY_COUNT=1
+POPUP_TIMEOUT_MS=3000
+DOM_QUIET_MS=400
+DOM_SETTLE_TIMEOUT_MS=3000
+USE_DOM_QUIESCENCE=true
+PAGE_LOAD_STATE=networkidle
+CLICK_NON_HTTP_LINKS=false
+DEFER_DESTRUCTIVE_ACTIONS=true
+DESTRUCTIVE_KEYWORDS="logout,log out,sign out,delete,remove,unsubscribe,cancel,checkout,pay,purchase,order,place order,reset,deactivate,terminate,drop,empty cart,clear cart"
+
+# Optional services
+DATABASE_URL=
+REDIS_URL=
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,44 @@
+# Logs
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+pnpm-debug.log*
+
+# Dependencies
+node_modules
+.pnp
+.pnp.js
+
+# Build output
+dist
+*.egg-info
+
+# Test / coverage
+coverage
+
+# Editor directories and files
+.vscode/*
+!.vscode/extensions.json
+.idea
+.DS_Store
+*.suo
+*.ntvs*
+*.njsproj
+*.sln
+*.sw?
+
+# TypeScript cache
+*.tsbuildinfo
+
+# Environment variables
+.env
+.env.*
+!.env.example
+
+**/__pycache__/
+/src/generated/prisma
+data/semantic_pipeline/
+data/semantic_source_archive/
+src/models/semantic/*.joblib
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,9 @@
+repos:
+  - repo: local
+    hooks:
+      - id: ruff
+        name: ruff (format + check)
+        entry: uv run python scripts/precommit_ruff.py
+        language: system
+        pass_filenames: true
+        types: [python]
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,40 @@
+COVERIT LABS PROPRIETARY SOFTWARE LICENSE
+
+Copyright (c) 2026 CoverIt Labs. All Rights Reserved.
+
+NOTICE: This software and its source code are the exclusive property of
+CoverIt Labs and constitute confidential and proprietary trade secrets.
+
+RESTRICTIONS:
+1. No part of this software, including source code, documentation, or
+   associated materials, may be copied, reproduced, modified, translated,
+   adapted, distributed, transmitted, displayed, performed, published,
+   licensed, transferred, sold, or used to create derivative works — in
+   whole or in part — by any means or in any form, without the prior
+   explicit written consent of CoverIt Labs.
+
+2. Access to this software is granted solely to authorized personnel and
+   contractors of CoverIt Labs for the purpose of developing, testing, and
+   maintaining CoverIt products and services.
+
+3. Authorized users must not disclose any part of this software or its
+   contents to any third party without the prior written approval of
+   CoverIt Labs.
+
+4. All intellectual property rights in and to this software, including
+   patents, copyrights, trademarks, and trade secrets, are and shall
+   remain the exclusive property of CoverIt Labs.
+
+DISCLAIMER:
+THIS SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND. COVERIT
+LABS DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT NOT
+LIMITED TO WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. IN NO EVENT SHALL COVERIT LABS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ARISING OUT OF OR IN CONNECTION WITH THE USE OF THIS SOFTWARE.
+
+Any violation of these terms will result in immediate termination of
+access rights and may be subject to civil and criminal penalties under
+applicable law.
+
+For licensing inquiries, contact the main contributor of this repository.
diff --git a/example_flows.py b/example_flows.py
@@ -0,0 +1,58 @@
+"""
+example_flows.py
+----------------
+Drop-in example showing how to call find_flows against a live session.
+Run from the repo root:
+
+    python example_flows.py
+"""
+
+import asyncio
+import logging
+
+from src.config import config
+from src.graph import create_graph
+from src.graph.flow_finder import find_flows
+
+logging.basicConfig(level=logging.INFO)
+
+SESSION_ID = "5adeca26-d6e3-41a7-b528-ba308614444b"  # passed to find_flows() as session_id
+TARGET_HASH = "97f69d333c60b1d384fdc968a8bc0f8a0669fdcb76d105d79fb7094232f67bdd"  # passed to find_flows() as target_hash
+
+
+async def main() -> None:
+    client, graph = await create_graph(
+        config.NEO4J_URI,
+        config.NEO4J_USER,
+        config.NEO4J_PASSWORD,
+    )
+
+    try:
+        flows = await find_flows(
+            graph,
+            session_id=SESSION_ID,
+            target_hash=TARGET_HASH,
+            max_paths=50,
+            max_depth=20,
+        )
+
+        print(f"\nFound {len(flows)} flow(s) to {TARGET_HASH}\n")
+
+        for i, flow in enumerate(flows, 1):
+            clip_note = f"clipped at checkpoint {flow.checkpoint}" if flow.is_clipped else "from root"
+            print(f"── Flow {i}  ({len(flow.clipped_path)} steps, {clip_note}) ──")
+
+            for step in flow.clipped_path:
+                if step.transition is None:
+                    print(f"  START  {step.state_hash}")
+                else:
+                    t = step.transition
+                    print(f"  → [{t.get('action_type', '?')}]  {t.get('action_description') or t.get('locator_value', '')}    ▶  {step.state_hash}")
+            print()
+
+    finally:
+        await client.close()
+
+
+if __name__ == "__main__":  # only when executed as a script, not on import
+    asyncio.run(main())
diff --git a/example_usage.py b/example_usage.py
@@ -0,0 +1,56 @@
+import asyncio
+import logging
+import os
+import uuid
+
+from src.config import config
+from src.crawler.session import CrawlSession
+from src.graph import create_graph
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+BASE_URL = "https://tryscrapeme.com/"  # URL constants below are alternative crawl targets
+QUOTES = "https://quotes.toscrape.com/"  # the target main() passes as base_url
+BOOKS = "https://books.toscrape.com/"
+OTHER_URL = "https://en.wikipedia.org/wiki/Main_Page"
+X = "https://the-internet.herokuapp.com/challenging_dom"
+WEBSITE_1 = "file:///D:/crawler_test_website/nexus_commerce/index.html"  # local file URL on drive D:
+
+
+async def main():
+    logger.info("Starting CoverIt Crawler...")
+    logger.info("Connecting to Neo4j...")
+    client, graph = await create_graph(
+        config.NEO4J_URI,
+        config.NEO4J_USER,
+        config.NEO4J_PASSWORD,
+    )
+
+    try:
+        crawl_session_id = str(uuid.uuid4())
+        config_path = os.path.join(os.path.dirname(__file__), "src", "configs", "input_defaults.json")
+        session = CrawlSession(
+            base_url=QUOTES,
+            graph_builder=graph,
+            config_path=config_path,
+            session_id=crawl_session_id,
+            headless=config.HEADLESS,
+        )
+
+        logger.info("Starting crawl...")
+        await session.run_crawl()
+
+        logger.info("\nCrawler execution successful!")
+
+    except Exception as e:
+        logger.error(f"Error: {e}", exc_info=True)
+
+    finally:
+        logger.info("Cleaning up...")
+        await client.close()
+        logger.info("Done!")
+
+
+if __name__ == "__main__":  # only when executed as a script, not on import
+    asyncio.run(main())
diff --git a/find_flows_script.py b/find_flows_script.py
@@ -0,0 +1,125 @@
+"""
+Run this from the coverit-crawler root after populating Neo4j via example_usage.py.
+This script calls find_all_flows() for a given session and prints the results, for testing/debugging purposes.
+It uses the Neo4j graph already populated by the crawler, so it does not require running the full crawl flow.
+
+Usage:
+    python find_flows_script.py <session_id>
+
+What it checks:
+    - find_all_flows() runs without error
+    - Every state with a flow has at least one path
+    - No path exceeds max_depth
+    - No path contains duplicate state hashes (no loops)
+    - Checkpoint reset works: clipped paths don't contain states from before the checkpoint
+    - Serialization produces valid JSON
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import sys
+
+
+async def main(session_id: str) -> None:
+    from src.config import config
+    from src.graph.factory import create_graph
+    from src.graph.flow_finder import _serialize_all_flows, find_all_flows
+
+    print("\nConnecting to Neo4j...")
+    client, graph_repo = await create_graph(
+        config.NEO4J_URI,
+        config.NEO4J_USER,
+        config.NEO4J_PASSWORD,
+    )
+
+    try:
+        print(f"Running find_all_flows for session: {session_id}\n")
+        all_flows = await find_all_flows(
+            graph_repo,
+            session_id=session_id,
+            max_paths_per_state=3,
+            max_depth=20,
+        )
+
+        if not all_flows:
+            print("ERROR: No flows returned — is Neo4j populated for this session?")
+            return
+
+        # ----------------------------------------------------------------
+        # Basic stats
+        # ----------------------------------------------------------------
+        total_flows = sum(len(flows) for flows in all_flows.values())
+        path_lengths = [len(flows) for flows in all_flows.values()]
+
+        print(f"States with flows : {len(all_flows)}")
+        print(f"Total flows       : {total_flows}")
+        print(f"Min path length   : {min(path_lengths)}")
+        print(f"Max path length   : {max(path_lengths)}")
+        print(f"Avg path length   : {sum(path_lengths) / len(path_lengths):.1f}")
+
+        # ----------------------------------------------------------------
+        # Correctness checks
+        # ----------------------------------------------------------------
+        errors: list[str] = []
+
+        for state_hash, flows in all_flows.items():
+            if not flows:
+                errors.append(f"State {state_hash[:8]} has no flows")
+                continue
+
+            for flow in flows:
+                if len(flow.transition_refs) > 20:
+                    errors.append(f"State {state_hash[:8]} has a long path ({len(flow.transition_refs)} steps)")
+
+                if len(set(flow.transition_refs)) != len(flow.transition_refs):
+                    errors.append(f"State {state_hash[:8]} has duplicate states in its path (loop detected)")
+
+                if flow.checkpoint_hash and flow.checkpoint_hash in flow.transition_refs:
+                    errors.append(f"State {state_hash[:8]} has checkpoint {flow.checkpoint_hash[:8]} in its path (checkpoint reset failed)")
+
+        # ----------------------------------------------------------------
+        # Serialization check
+        # ----------------------------------------------------------------
+        try:
+            serialized = _serialize_all_flows(all_flows)
+            json_str = json.dumps(serialized)
+            reparsed = json.loads(json_str)
+            # save the json, even if file doesntt exist, for inspection
+            with open("all_flows.json", "w", encoding="utf-8") as f:
+                f.write(json_str)
+            assert len(reparsed) == len(all_flows), "Serialized state count mismatch"
+            print(f"\nSerialized payload size: {len(json_str) / 1024:.1f} KB")
+        except Exception as e:
+            errors.append(f"Serialization failed: {e}")
+
+        # ----------------------------------------------------------------
+        # Sample output print
+        # ----------------------------------------------------------------
+        print("\n---all flows for all states ---")
+        for state_hash, flows in list(all_flows.items()):
+            for flow in flows:
+                print(f"State {state_hash} <- checkpoint {flow.checkpoint_hash} via {[t for t in flow.transition_refs]}")
+
+        # ----------------------------------------------------------------
+        # Result
+        # ----------------------------------------------------------------
+        print("\n--- Checks ---")
+        if errors:
+            for err in errors:
+                print(f"  FAIL: {err}")
+            print(f"\n{len(errors)} check(s) failed.")
+        else:
+            print("  All checks passed.")
+
+    finally:
+        await client.close()
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Usage: python test_find_all_flows.py <session_id>")
+        sys.exit(1)
+
+    asyncio.run(main(sys.argv[1]))