From b934772150f0733aaddddbd270a22d2ca4447c83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tiberiu=20Sab=C4=83u?= Date: Mon, 2 Feb 2026 15:52:50 +0100 Subject: [PATCH] fix(utils): accept single column csv files --- docs/features/csv-support.md | 26 ++++++ src/lib/utils/fileDetector.ts | 24 ++++-- test/fixtures/single-column-cities.csv | 8 ++ test/fixtures/single-column-emails.csv | 3 + test/fixtures/single-column-ids.csv | 5 ++ test/fixtures/single-column-names.csv | 8 ++ test/unit/utils/csvProcessor.test.ts | 110 +++++++++++++++++++++++++ test/unit/utils/fileDetector.test.ts | 109 ++++++++++++++++++++++++ 8 files changed, 287 insertions(+), 6 deletions(-) create mode 100644 test/fixtures/single-column-cities.csv create mode 100644 test/fixtures/single-column-emails.csv create mode 100644 test/fixtures/single-column-ids.csv create mode 100644 test/fixtures/single-column-names.csv diff --git a/docs/features/csv-support.md b/docs/features/csv-support.md index 2a9182da7..990af8f5a 100644 --- a/docs/features/csv-support.md +++ b/docs/features/csv-support.md @@ -12,6 +12,30 @@ CSV support in NeuroLink works just like image support - it's a multimodal input 4. **Injects** formatted CSV data into your prompt text 5. **Works** with ALL AI providers (not limited to vision models) +### Single-Column CSV Support (CSV-009) + +NeuroLink fully supports **single-column CSV files**, which are commonly used for: + +- **Lists of IDs**: `ID123`, `ID456`, `ID789` +- **Names**: `Alice Johnson`, `Bob Smith`, `Charlie Brown` +- **Email addresses**: `alice@example.com`, `bob@company.org` +- **Cities/Locations**: `New York`, `Los Angeles`, `Chicago` +- **Product codes**, **account numbers**, **reference numbers**, etc. 
+ +Single-column CSVs are automatically detected and processed like multi-column CSVs: + +```typescript +// Example: Analyze a list of customer IDs +const result = await neurolink.generate({ + input: { + text: "Analyze these customer IDs and identify any patterns", + csvFiles: ["customer-ids.csv"], // Single column: ID123, ID456, ID789... + }, +}); +``` + +The detection system uses data-like pattern validation to distinguish single-column CSVs from prose or other text formats. + ## Quick Start ### SDK Usage @@ -231,6 +255,8 @@ NeuroLink uses a **multi-strategy detection system** with confidence scores: 4. **Content Heuristics** (75% confidence) - Analyzes file content patterns - Detects CSV by checking consistent comma-separated columns + - **CSV-009**: Supports single-column CSVs (e.g., lists of IDs, names, emails) + - Single-column detection uses data-like pattern validation The system stops at the **first strategy with 80%+ confidence**. diff --git a/src/lib/utils/fileDetector.ts b/src/lib/utils/fileDetector.ts index 7a1fd02e2..7d8dd95ab 100644 --- a/src/lib/utils/fileDetector.ts +++ b/src/lib/utils/fileDetector.ts @@ -1088,19 +1088,31 @@ class ContentHeuristicStrategy implements DetectionStrategy { ); const noBinaryChars = !text.includes("\0"); - // Single-column CSVs should have VERY uniform line lengths - // (data values like IDs, codes, numbers - not varied content) + // CSV-009: Accept single-column CSVs with varied data + // Single-column CSVs can contain varied data (names, cities, emails, IDs, etc.) + // We check for data-like characteristics rather than strict uniformity const lengths = lines.map((l) => l.length); const avgLength = lengths.reduce((a, b) => a + b, 0) / lengths.length; const variance = lengths.reduce((sum, len) => sum + Math.pow(len - avgLength, 2), 0) / lengths.length; const stdDev = Math.sqrt(variance); - // Single-column CSVs can contain varied data (names, cities, emails, etc.) 
- // but should still show some consistency compared to random text - const hasUniformLengths = stdDev / avgLength < 0.75; - return hasReasonableLengths && noBinaryChars && hasUniformLengths; + // Relaxed coefficient of variation check (1.0 instead of 0.75) + // This allows for more variation while still filtering out random text/prose + const coefficientOfVariation = stdDev / avgLength; + const hasReasonableVariation = coefficientOfVariation < 1.0; + + // Additional check: At least 50% of lines should be non-empty after trimming + const nonEmptyLines = lines.filter((l) => l.trim().length > 0).length; + const hasEnoughContent = nonEmptyLines / lines.length >= 0.5; + + return ( + hasReasonableLengths && + noBinaryChars && + hasReasonableVariation && + hasEnoughContent + ); } // Count delimiters per line and check consistency diff --git a/test/fixtures/single-column-cities.csv b/test/fixtures/single-column-cities.csv new file mode 100644 index 000000000..82ebdb452 --- /dev/null +++ b/test/fixtures/single-column-cities.csv @@ -0,0 +1,8 @@ +New York +Los Angeles +Chicago +Houston +Phoenix +PhiladelphiaSan Antonio +San Diego +Dallas diff --git a/test/fixtures/single-column-emails.csv b/test/fixtures/single-column-emails.csv new file mode 100644 index 000000000..7a2e91bf2 --- /dev/null +++ b/test/fixtures/single-column-emails.csv @@ -0,0 +1,3 @@ +alice@example.com +bob.smith@company.org +charlie.brown@mail.com \ No newline at end of file diff --git a/test/fixtures/single-column-ids.csv b/test/fixtures/single-column-ids.csv new file mode 100644 index 000000000..613034fe4 --- /dev/null +++ b/test/fixtures/single-column-ids.csv @@ -0,0 +1,5 @@ +ID123 +ID456 +ID789 +ID101 +ID202 \ No newline at end of file diff --git a/test/fixtures/single-column-names.csv b/test/fixtures/single-column-names.csv new file mode 100644 index 000000000..d3456a46a --- /dev/null +++ b/test/fixtures/single-column-names.csv @@ -0,0 +1,8 @@ +Alice Johnson +Bob Smith +Charlie Brown +Diana Martinez 
+Edward Thompson +Frank Wilson +Grace Lee +Hannah Davis \ No newline at end of file diff --git a/test/unit/utils/csvProcessor.test.ts b/test/unit/utils/csvProcessor.test.ts index d57ef8058..42f3fe39b 100644 --- a/test/unit/utils/csvProcessor.test.ts +++ b/test/unit/utils/csvProcessor.test.ts @@ -546,4 +546,114 @@ Charlie,35,Chicago`; expect(rawResult.metadata.totalLines).toBe(5); // header + 2 data + 2 whitespace }); }); + + describe("CSV-009: Single-column CSV support", () => { + it("should process single-column CSV with IDs", async () => { + const csvData = Buffer.from("ID123\nID456\nID789\nID101"); + + const rawResult = await CSVProcessor.process(csvData, { + formatStyle: "raw", + }); + const jsonResult = await CSVProcessor.process(csvData, { + formatStyle: "json", + }); + + expect(rawResult.type).toBe("csv"); + expect(rawResult.metadata.columnCount).toBe(1); + expect(rawResult.metadata.rowCount).toBeGreaterThanOrEqual(3); + expect(rawResult.content).toContain("ID123"); + + expect(jsonResult.type).toBe("csv"); + expect(jsonResult.metadata.columnCount).toBe(1); + }); + + it("should process single-column CSV with names", async () => { + const csvData = Buffer.from( + "Alice Johnson\nBob Smith\nCharlie Brown\nDiana Martinez", + ); + + const rawResult = await CSVProcessor.process(csvData, { + formatStyle: "raw", + }); + + expect(rawResult.type).toBe("csv"); + expect(rawResult.metadata.columnCount).toBe(1); + expect(rawResult.metadata.rowCount).toBeGreaterThanOrEqual(3); + expect(rawResult.content).toContain("Alice Johnson"); + expect(rawResult.content).toContain("Bob Smith"); + }); + + it("should process single-column CSV with emails", async () => { + const csvData = Buffer.from( + "alice@example.com\nbob.smith@company.org\ncharlie.brown@mail.com", + ); + + const jsonResult = await CSVProcessor.process(csvData, { + formatStyle: "json", + }); + + expect(jsonResult.type).toBe("csv"); + expect(jsonResult.metadata.columnCount).toBe(1); + // First row becomes header, so 
2 data rows + expect(jsonResult.metadata.rowCount).toBe(2); + expect(jsonResult.content).toContain("alice@example.com"); + }); + + it("should process single-column CSV with cities (varied lengths)", async () => { + const csvData = Buffer.from( + "New York\nLos Angeles\nChicago\nHouston\nPhoenix\nPhiladelphia", + ); + + const rawResult = await CSVProcessor.process(csvData, { + formatStyle: "raw", + }); + + expect(rawResult.type).toBe("csv"); + expect(rawResult.metadata.columnCount).toBe(1); + // First row becomes header, so 5 data rows + expect(rawResult.metadata.rowCount).toBe(5); + expect(rawResult.content).toContain("New York"); + expect(rawResult.content).toContain("Philadelphia"); + }); + + it("should respect maxRows for single-column CSVs", async () => { + const csvData = Buffer.from("Item1\nItem2\nItem3\nItem4\nItem5\nItem6"); + + const result = await CSVProcessor.process(csvData, { + formatStyle: "raw", + maxRows: 3, + }); + + expect(result.metadata.rowCount).toBe(3); + expect(result.content).toContain("Item1"); + expect(result.content).toContain("Item3"); + expect(result.content).not.toContain("Item6"); + }); + + it("should handle single-column CSV in markdown format", async () => { + const csvData = Buffer.from("Product\nLaptop\nMouse\nKeyboard"); + + const result = await CSVProcessor.process(csvData, { + formatStyle: "markdown", + }); + + expect(result.type).toBe("csv"); + expect(result.metadata.columnCount).toBe(1); + expect(result.content).toContain("Product"); + expect(result.content).toContain("Laptop"); + }); + + it("should log success for single-column CSV processing", async () => { + const csvData = Buffer.from("Value1\nValue2\nValue3"); + await CSVProcessor.process(csvData, { formatStyle: "raw" }); + + expect(logger.info).toHaveBeenCalledWith( + "[CSVProcessor] ✅ Processed CSV file", + expect.objectContaining({ + formatStyle: "raw", + columnCount: 1, + }), + ); + }); + }); }); diff --git a/test/unit/utils/fileDetector.test.ts 
b/test/unit/utils/fileDetector.test.ts index 2a7d99aae..b1e535090 100644 --- a/test/unit/utils/fileDetector.test.ts +++ b/test/unit/utils/fileDetector.test.ts @@ -494,4 +494,113 @@ describe("FileDetector", () => { expect(result.type).toBe("pdf"); }); }); + + describe("CSV-009: Single-column CSV detection", () => { + it("should detect and process single-column CSV with IDs", async () => { + const csvPath = join(fixturesPath, "single-column-ids.csv"); + const result = await FileDetector.detectAndProcess(csvPath); + + expect(result.type).toBe("csv"); + expect(result.mimeType).toBe("text/csv"); + expect(result.metadata.columnCount).toBe(1); + expect(result.metadata.rowCount).toBeGreaterThanOrEqual(3); + expect(result.content).toContain("ID123"); + }); + + it("should detect and process single-column CSV with names", async () => { + const csvPath = join(fixturesPath, "single-column-names.csv"); + const result = await FileDetector.detectAndProcess(csvPath); + + expect(result.type).toBe("csv"); + expect(result.metadata.columnCount).toBe(1); + expect(result.metadata.rowCount).toBeGreaterThanOrEqual(5); + expect(result.content).toContain("Alice Johnson"); + expect(result.content).toContain("Bob Smith"); + }); + + it("should detect and process single-column CSV with cities (varied lengths)", async () => { + const csvPath = join(fixturesPath, "single-column-cities.csv"); + const result = await FileDetector.detectAndProcess(csvPath); + + expect(result.type).toBe("csv"); + expect(result.metadata.columnCount).toBe(1); + expect(result.metadata.rowCount).toBeGreaterThanOrEqual(5); + expect(result.content).toContain("New York"); + expect(result.content).toContain("Los Angeles"); + expect(result.content).toContain("Philadelphia"); + }); + + it("should detect and process single-column CSV with emails", async () => { + const csvPath = join(fixturesPath, "single-column-emails.csv"); + const result = await FileDetector.detectAndProcess(csvPath); + + expect(result.type).toBe("csv"); + 
expect(result.metadata.columnCount).toBe(1); + // First row becomes header, so at least 2 data rows + expect(result.metadata.rowCount).toBeGreaterThanOrEqual(2); + expect(result.content).toContain("alice@example.com"); + expect(result.content).toContain("bob.smith@company.org"); + }); + + it("should detect single-column CSV from Buffer", async () => { + const csvBuffer = Buffer.from("Item1\nItem2\nItem3\nItem4"); + const result = await FileDetector.detectAndProcess(csvBuffer); + + expect(result.type).toBe("csv"); + expect(result.metadata.columnCount).toBe(1); + // First row becomes header, so 3 data rows + expect(result.metadata.rowCount).toBe(3); + }); + + it("should respect allowedTypes for single-column CSVs", async () => { + const csvPath = join(fixturesPath, "single-column-ids.csv"); + const result = await FileDetector.detectAndProcess(csvPath, { + allowedTypes: ["csv"], + }); + + expect(result.type).toBe("csv"); + expect(result.metadata.columnCount).toBe(1); + }); + + it("should apply CSV options to single-column CSVs", async () => { + const csvPath = join(fixturesPath, "single-column-names.csv"); + const result = await FileDetector.detectAndProcess(csvPath, { + csvOptions: { + maxRows: 3, + formatStyle: "json", + }, + }); + + expect(result.type).toBe("csv"); + expect(result.metadata.rowCount).toBe(3); + expect(result.metadata.columnCount).toBe(1); + }); + + it("should not detect prose/sentences as single-column CSV", async () => { + const proseBuffer = Buffer.from( + "This is a normal sentence with many words.\nHere is another sentence with even more words in it.\nAnd a third sentence to complete the paragraph.", + ); + const result = await FileDetector.detectAndProcess(proseBuffer); + + // Should be detected as text, not CSV (prose has >4 words per line) + expect(result.type).toBe("text"); + }); + + it("should not detect JSON as single-column CSV", async () => { + const jsonBuffer = Buffer.from('[{"name": "Alice"}, {"name": "Bob"}]'); + const result = await 
FileDetector.detectAndProcess(jsonBuffer); + + // Should be detected as text (JSON), not CSV + expect(result.type).toBe("text"); + expect(result.mimeType).toContain("json"); + }); + + it("should not detect YAML as single-column CSV", async () => { + const yamlBuffer = Buffer.from("---\nkey1: value1\nkey2: value2\n"); + const result = await FileDetector.detectAndProcess(yamlBuffer); + + // Should be detected as text (YAML), not CSV + expect(result.type).toBe("text"); + }); + }); });