colbymchenry · CansuKhon · Jun 15, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,6 +14,7 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 - `codegraph index` now rebuilds the full graph from scratch, so it produces the same result as a fresh `codegraph init` instead of reporting "0 nodes, 0 edges" and looking like it wiped your index. Previously, re-running `index` on an unchanged project skipped every file (their contents hadn't changed) and showed an empty-looking summary; it now clears and re-indexes for an honest, complete rebuild every time. Use `codegraph sync` for fast incremental updates between full rebuilds. Thanks @Arc-univer. (#874)
 - The file watcher that auto-syncs the graph now fails cleanly when live watching can no longer be trusted, instead of looking healthy while the index quietly goes stale. If the operating system runs out of file-watch resources, or another process holds the write lock far longer than a normal save, CodeGraph now disables auto-sync once — with a single clear message telling you to run `codegraph sync` (or rely on the git sync hooks) to refresh — rather than retrying forever or repeating the same error on a loop. And while auto-sync is disabled, CodeGraph's tool responses (and `codegraph status`) now say so plainly, so your AI agent knows to read files directly instead of trusting a frozen index. This mostly matters for long-running MCP/daemon sessions, which could otherwise keep serving stale results while appearing to work. Thanks @thismilktea. (#876)
 - On Linux, hitting the kernel's inotify watch limit on a large project no longer silently leaves half the tree unwatched. CodeGraph now tells you once — naming the exact setting to raise (`fs.inotify.max_user_watches`, e.g. `sudo sysctl fs.inotify.max_user_watches=1048576`) — and keeps live-watching the directories it could register while `codegraph sync` (or the git sync hooks) covers the rest. (#876)
+- `codegraph sync` no longer fails with "too many SQL variables" on very large repositories. Syncing a project with tens of thousands of changed files at once would stop right after "Parsing code 100%" and leave the graph half-written; CodeGraph now passes each of those internal lookups to the database as a single list parameter instead of one variable per item, so a sync completes cleanly no matter how big the project is.
 
 
 ## [1.0.1] - 2026-06-13

diff --git a/__tests__/sql-variable-overflow.test.ts b/__tests__/sql-variable-overflow.test.ts
@@ -0,0 +1,84 @@
+/**
+ * Regression: bulk reads/deletes must never bind one SQL variable per row.
+ *
+ * SQLite caps bound parameters per statement at SQLITE_MAX_VARIABLE_NUMBER
+ * (32766 on the node:sqlite build CodeGraph ships). A statement built as
+ * `... IN (?,?,…)` with one placeholder per id/path overflows that cap once a
+ * project is large enough, and `codegraph sync` aborts with "too many SQL
+ * variables" in the resolution write phase — right after "Parsing code 100%".
+ *
+ * Every project-size-scaling lookup/delete now binds its list as ONE JSON
+ * parameter expanded server-side via `json_each(?)`, so the bound-variable
+ * count is fixed at 1 regardless of project size. These tests drive three of
+ * them with > 32766 rows: the unchunked delete threw on the pre-fix code; the
+ * two lookups (previously chunked at 500) are covered to guard regressions.
+ */
+
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import { DatabaseConnection } from '../src/db';
+import { QueryBuilder } from '../src/db/queries';
+
+// Comfortably past SQLITE_MAX_VARIABLE_NUMBER (32766) so a placeholder-per-row
+// statement is guaranteed to overflow.
+const N = 40000;
+
+describe('SQL variable overflow — bulk paths stay O(1) in bound params', () => {
+  let dir: string;
+  let conn: DatabaseConnection;
+  let db: ReturnType<DatabaseConnection['getDb']>;
+  let queries: QueryBuilder;
+
+  beforeEach(() => {
+    dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-overflow-'));
+    conn = DatabaseConnection.initialize(path.join(dir, 'codegraph.db'));
+    db = conn.getDb();
+    queries = new QueryBuilder(db);
+
+    // Seed N nodes and N unresolved refs (one per node) directly, fast, in a
+    // single transaction — we only need the rows to exist, not real extraction.
+    const insertNode = db.prepare(
+      `INSERT INTO nodes (id, kind, name, qualified_name, file_path, language,
+         start_line, end_line, start_column, end_column, updated_at)
+       VALUES (?, 'function', ?, ?, ?, 'javascript', 1, 1, 0, 0, 0)`
+    );
+    const insertRef = db.prepare(
+      `INSERT INTO unresolved_refs (from_node_id, reference_name, reference_kind, line, col, file_path, language)
+       VALUES (?, 'target', 'calls', 1, 0, ?, 'javascript')`
+    );
+    db.transaction(() => {
+      for (let i = 0; i < N; i++) {
+        const id = `node_${i}`;
+        const file = `src/file_${i}.js`;
+        insertNode.run(id, `fn_${i}`, `fn_${i}`, file);
+        insertRef.run(id, file);
+      }
+    })();
+  });
+
+  afterEach(() => {
+    conn.close();
+    fs.rmSync(dir, { recursive: true, force: true });
+  });
+
+  it(`getNodesByIds resolves ${N} ids without "too many SQL variables"`, () => {
+    const ids = Array.from({ length: N }, (_, i) => `node_${i}`);
+    const fresh = new QueryBuilder(db); // empty LRU → every id is a DB miss
+    const nodes = fresh.getNodesByIds(ids);
+    expect(nodes.size).toBe(N);
+  });
+
+  it(`getUnresolvedReferencesByFiles scans ${N} file paths without overflow`, () => {
+    const files = Array.from({ length: N }, (_, i) => `src/file_${i}.js`);
+    const refs = queries.getUnresolvedReferencesByFiles(files);
+    expect(refs.length).toBe(N);
+  });
+
+  it(`deleteResolvedReferences deletes ${N} from-node ids without overflow`, () => {
+    const ids = Array.from({ length: N }, (_, i) => `node_${i}`);
+    queries.deleteResolvedReferences(ids);
+    expect(queries.getUnresolvedReferencesCount()).toBe(0);
+  });
+});
diff --git a/src/db/queries.ts b/src/db/queries.ts
@@ -47,8 +47,6 @@ function isLowValueFile(filePath: string): boolean {
   );
 }
 
-const SQLITE_PARAM_CHUNK_SIZE = 500;
-
 /**
  * Database row types (snake_case from SQLite)
  */
@@ -469,20 +467,18 @@ export class QueryBuilder {
     }
     if (misses.length === 0) return out;
 
-    // Chunk under SQLite's parameter limit (default 999, raised to 32766
-    // in better-sqlite3 builds — chunk at 500 for safety across both
-    // backends and to keep the query plan simple).
-    for (let i = 0; i < misses.length; i += SQLITE_PARAM_CHUNK_SIZE) {
-      const chunk = misses.slice(i, i + SQLITE_PARAM_CHUNK_SIZE);
-      const placeholders = chunk.map(() => '?').join(',');
-      const rows = this.db
-        .prepare(`SELECT * FROM nodes WHERE id IN (${placeholders})`)
-        .all(...chunk) as NodeRow[];
-      for (const row of rows) {
-        const node = rowToNode(row);
-        out.set(node.id, node);
-        this.cacheNode(node);
-      }
+    // Bind the id list as ONE JSON parameter and expand it server-side with
+    // json_each, so the bound-parameter count is fixed at 1 regardless of how
+    // many ids we look up. A placeholder-per-id `IN (?,?,…)` overflows
+    // SQLITE_MAX_VARIABLE_NUMBER (32766) once a project is large enough — the
+    // root cause of the "too many SQL variables" sync failure.
+    const rows = this.db
+      .prepare(`SELECT * FROM nodes WHERE id IN (SELECT value FROM json_each(?))`)
+      .all(JSON.stringify(misses)) as NodeRow[];
+    for (const row of rows) {
+      const node = rowToNode(row);
+      out.set(node.id, node);
+      this.cacheNode(node);
     }
     return out;
   }
@@ -491,16 +487,15 @@ export class QueryBuilder {
     const out = new Set<string>();
     if (ids.length === 0) return out;
 
+    // One JSON parameter expanded via json_each — fixed at a single bound
+    // variable no matter how many ids, so it can never overflow
+    // SQLITE_MAX_VARIABLE_NUMBER on a large project.
     const uniqueIds = [...new Set(ids)];
-    for (let i = 0; i < uniqueIds.length; i += SQLITE_PARAM_CHUNK_SIZE) {
-      const chunk = uniqueIds.slice(i, i + SQLITE_PARAM_CHUNK_SIZE);
-      const placeholders = chunk.map(() => '?').join(',');
-      const rows = this.db
-        .prepare(`SELECT id FROM nodes WHERE id IN (${placeholders})`)
-        .all(...chunk) as { id: string }[];
-      for (const row of rows) {
-        out.add(row.id);
-      }
+    const rows = this.db
+      .prepare(`SELECT id FROM nodes WHERE id IN (SELECT value FROM json_each(?))`)
+      .all(JSON.stringify(uniqueIds)) as { id: string }[];
+    for (const row of rows) {
+      out.add(row.id);
     }
 
     return out;
@@ -1667,19 +1662,15 @@ export class QueryBuilder {
   getUnresolvedReferencesByFiles(filePaths: string[]): UnresolvedReference[] {
     if (filePaths.length === 0) return [];
 
-    // Chunk under SQLite's parameter limit: the first sync of a very large repo
-    // passes every changed file here, which an unbounded `IN (...)` would bind
-    // as one parameter each — exceeding MAX_VARIABLE_NUMBER and aborting with
+    // The first sync of a very large repo passes every changed file here. Bind
+    // the whole list as ONE JSON parameter and expand it server-side with
+    // json_each, so the statement uses a single bound variable regardless of
+    // file count — an `IN (?,?,…)` placeholder-per-file would bind one variable
+    // each and exceed SQLITE_MAX_VARIABLE_NUMBER (32766), aborting with
     // "too many SQL variables". (#540)
-    const rows: UnresolvedRefRow[] = [];
-    for (let i = 0; i < filePaths.length; i += SQLITE_PARAM_CHUNK_SIZE) {
-      const chunk = filePaths.slice(i, i + SQLITE_PARAM_CHUNK_SIZE);
-      const placeholders = chunk.map(() => '?').join(',');
-      const chunkRows = this.db
-        .prepare(`SELECT * FROM unresolved_refs WHERE file_path IN (${placeholders})`)
-        .all(...chunk) as UnresolvedRefRow[];
-      rows.push(...chunkRows);
-    }
+    const rows = this.db
+      .prepare(`SELECT * FROM unresolved_refs WHERE file_path IN (SELECT value FROM json_each(?))`)
+      .all(JSON.stringify(filePaths)) as UnresolvedRefRow[];
 
     return rows.map((row) => ({
       fromNodeId: row.from_node_id,
@@ -1705,8 +1696,12 @@ export class QueryBuilder {
    */
   deleteResolvedReferences(fromNodeIds: string[]): void {
     if (fromNodeIds.length === 0) return;
-    const placeholders = fromNodeIds.map(() => '?').join(',');
-    this.db.prepare(`DELETE FROM unresolved_refs WHERE from_node_id IN (${placeholders})`).run(...fromNodeIds);
+    // One JSON parameter expanded via json_each — a single bound variable
+    // regardless of how many ids resolve, so a large resolution batch can never
+    // overflow SQLITE_MAX_VARIABLE_NUMBER with "too many SQL variables".
+    this.db
+      .prepare(`DELETE FROM unresolved_refs WHERE from_node_id IN (SELECT value FROM json_each(?))`)
+      .run(JSON.stringify(fromNodeIds));
   }
 
   /**