From a43b956b40db179fd604a0748c39f12647fcdb8f Mon Sep 17 00:00:00 2001 From: waleed Date: Sat, 27 Jun 2026 18:42:57 -0700 Subject: [PATCH 1/7] feat(chat): support zip uploads as virtual folders in the copilot VFS Accept .zip chat attachments and present each archive as a virtual folder the agent lists and reads entry-by-entry. The archive is stored once; entries are extracted lazily on read, reusing the existing file-parsers and zip-bomb / zip-slip guards. No changes to the Go copilot service. - allow zip in the attachment allowlist + chat accept attribute - shared lib/uploads/archive.ts (factored from the file-manage decompress route) - split readFileRecord into a pure renderFileBuffer reused for in-zip entries - single-resolve readChatUploadPath/grepChatUploadPath dispatchers + VFS routing - inline file tree in the upload context message --- apps/sim/app/api/tools/file/manage/route.ts | 107 +------- apps/sim/lib/copilot/chat/payload.ts | 66 ++++- .../tools/handlers/upload-file-reader.test.ts | 111 +++++++- .../tools/handlers/upload-file-reader.ts | 254 +++++++++++++++-- .../lib/copilot/tools/handlers/vfs.test.ts | 141 ++++++++-- apps/sim/lib/copilot/tools/handlers/vfs.ts | 62 ++++- apps/sim/lib/copilot/vfs/file-reader.test.ts | 72 ++++- apps/sim/lib/copilot/vfs/file-reader.ts | 259 +++++++++++------- apps/sim/lib/uploads/archive.test.ts | 123 +++++++++ apps/sim/lib/uploads/archive.ts | 209 ++++++++++++++ apps/sim/lib/uploads/utils/file-utils.ts | 16 +- apps/sim/lib/uploads/utils/validation.test.ts | 11 +- apps/sim/lib/uploads/utils/validation.ts | 24 +- 13 files changed, 1183 insertions(+), 272 deletions(-) create mode 100644 apps/sim/lib/uploads/archive.test.ts create mode 100644 apps/sim/lib/uploads/archive.ts diff --git a/apps/sim/app/api/tools/file/manage/route.ts b/apps/sim/app/api/tools/file/manage/route.ts index 367a5db8cfc..a29c47613fe 100644 --- a/apps/sim/app/api/tools/file/manage/route.ts +++ b/apps/sim/app/api/tools/file/manage/route.ts @@ -1,5 +1,4 @@ import { Buffer, isUtf8 } from 'buffer' -import type { Readable } from 'stream' import { AuditAction, AuditResourceType, recordAudit } from '@sim/audit' import { createLogger } from '@sim/logger' import { getErrorMessage } from '@sim/utils/errors' @@ -21,6 +20,16 @@ import { ShareValidationError, upsertFileShare, } from '@/lib/public-shares/share-manager' +import { + inflateEntryWithinCaps, + isSymlinkEntry, + MAX_ARCHIVE_BYTES as MAX_DECOMPRESS_ARCHIVE_BYTES, + MAX_ARCHIVE_ENTRIES as MAX_DECOMPRESS_ENTRIES, + MAX_ARCHIVE_ENTRY_BYTES as MAX_DECOMPRESS_ENTRY_BYTES, + MAX_ARCHIVE_TOTAL_BYTES as MAX_DECOMPRESS_TOTAL_BYTES, + readEntryUncompressedSize, + sanitizeArchiveEntryPath, +} from '@/lib/uploads/archive' import { ensureWorkspaceFileFolderPath } from '@/lib/uploads/contexts/workspace/workspace-file-folder-manager' import { fetchWorkspaceFileBuffer, @@ -199,102 +208,6 @@ const uniqueZipEntryName = (name: string, usedNames: Set): string => { return candidate } -/** Input archive download cap for the decompress operation. */ -const MAX_DECOMPRESS_ARCHIVE_BYTES = 100 * 1024 * 1024 -/** Maximum number of entries extracted from a single archive. */ -const MAX_DECOMPRESS_ENTRIES = 1000 -/** Maximum uncompressed size for any single archive entry. */ -const MAX_DECOMPRESS_ENTRY_BYTES = 100 * 1024 * 1024 -/** Maximum total uncompressed size across all entries, to bound zip-bomb expansion. */ -const MAX_DECOMPRESS_TOTAL_BYTES = 200 * 1024 * 1024 - -const S_IFMT = 0o170000 -const S_IFLNK = 0o120000 - -/** - * Read a zip entry's declared uncompressed size without materializing it. This - * value comes straight from the (attacker-controlled) ZIP metadata, so it is only - * usable as a cheap fast-reject for honestly-declared archives — never as the - * authoritative cap. {@link inflateEntryWithinCaps} enforces the real limit on the - * inflated byte stream. - */ -const readEntryUncompressedSize = (entry: JSZip.JSZipObject): number | undefined => { - const data = (entry as JSZip.JSZipObject & { _data?: { uncompressedSize?: number } })._data - const size = data?.uncompressedSize - return typeof size === 'number' && Number.isFinite(size) ? size : undefined -} - -type InflateResult = { ok: true; buffer: Buffer } | { ok: false; reason: 'entry' | 'total' } - -/** - * Inflate a single zip entry through a streaming counting sink, tearing the - * stream down the moment cumulative output would exceed the per-entry cap or the - * remaining total budget. The declared uncompressed size in the ZIP header is - * attacker-controlled and is NOT trusted here: a forged-small or absent size - * cannot cause the full (potentially gigabyte-scale) entry to be materialized in - * memory, because enforcement happens on the actual inflated bytes as they - * arrive. Peak memory is bounded by the cap plus one DEFLATE chunk. - */ -const inflateEntryWithinCaps = ( - entry: JSZip.JSZipObject, - remainingTotalBudget: number -): Promise => - new Promise((resolve, reject) => { - const chunks: Buffer[] = [] - let size = 0 - let settled = false - const stream = entry.nodeStream() as Readable - - const settle = (result: InflateResult) => { - if (settled) return - settled = true - stream.destroy() - resolve(result) - } - - stream.on('data', (chunk: Buffer) => { - size += chunk.length - if (size > MAX_DECOMPRESS_ENTRY_BYTES) { - settle({ ok: false, reason: 'entry' }) - return - } - if (size > remainingTotalBudget) { - settle({ ok: false, reason: 'total' }) - return - } - chunks.push(chunk) - }) - stream.on('end', () => settle({ ok: true, buffer: Buffer.concat(chunks, size) })) - stream.on('error', (error) => { - if (settled) return - settled = true - stream.destroy() - reject(error) - }) - }) - -/** True when a zip entry's unix mode marks it as a symlink (never extracted). */ -const isSymlinkEntry = (entry: JSZip.JSZipObject): boolean => { - const mode = (entry as JSZip.JSZipObject & { unixPermissions?: number | null }).unixPermissions - return typeof mode === 'number' && (mode & S_IFMT) === S_IFLNK -} - -/** - * Normalize a zip entry path into safe workspace folder segments, guarding against - * zip-slip. Returns null for traversal (`..`), so the entry is skipped rather than - * written outside its intended location. - */ -const sanitizeArchiveEntryPath = (rawPath: string): string[] | null => { - const segments = rawPath - .replace(/\\/g, '/') - .split('/') - .map((segment) => segment.trim()) - .filter((segment) => segment.length > 0 && segment !== '.') - - if (segments.length === 0 || segments.includes('..')) return null - return segments -} - const isLikelyTextBuffer = (buffer: Buffer): boolean => isUtf8(buffer) && !buffer.includes(0) /** diff --git a/apps/sim/lib/copilot/chat/payload.ts b/apps/sim/lib/copilot/chat/payload.ts index e718b33cce3..e2f8c0dcab0 100644 --- a/apps/sim/lib/copilot/chat/payload.ts +++ b/apps/sim/lib/copilot/chat/payload.ts @@ -7,13 +7,20 @@ import type { VfsSnapshotV1 } from '@/lib/copilot/generated/vfs-snapshot-v1' import { getExposedIntegrationTools } from '@/lib/copilot/integration-tools' import { getToolEntry } from '@/lib/copilot/tool-executor/router' import { getCopilotToolDescription } from '@/lib/copilot/tools/descriptions' +import { + type ChatUploadArchiveEntry, + listChatUploadArchiveEntries, +} from '@/lib/copilot/tools/handlers/upload-file-reader' import { encodeVfsSegment } from '@/lib/copilot/vfs/path-utils' import { isE2BDocEnabled, isHosted } from '@/lib/core/config/env-flags' import { buildUserSkillTool } from '@/lib/mothership/skills' import { trackChatUpload } from '@/lib/uploads/contexts/workspace/workspace-file-manager' +import { isArchiveFileName } from '@/lib/uploads/utils/file-utils' import { stripVersionSuffix } from '@/tools/utils' const logger = createLogger('CopilotChatPayload') +/** Max archive entries listed inline in the upload context before truncating. */ +const MAX_UPLOAD_TREE_ENTRIES = 50 const INTEGRATION_TOOL_SCHEMA_CACHE_TTL_MS = 5_000 const INTEGRATION_TOOL_SCHEMA_CACHE_MAX_ENTRIES = 500 @@ -297,15 +304,56 @@ export async function buildCopilotRequestPayload( } catch { encodedUploadName = displayName } - const lines = [ - `File "${displayName}" (${mediaType}, ${f.size} bytes) uploaded.`, - `Read with: read("uploads/${encodedUploadName}")`, - `To save permanently: materialize_file(fileName: "${displayName}")`, - ] - if (displayName.endsWith('.json')) { - lines.push( - `To import as a workflow: materialize_file(fileName: "${displayName}", operation: "import")` - ) + let lines: string[] + if (isArchiveFileName(displayName)) { + // An archive is presented as a virtual folder. Show a capped file tree + // up front so the agent sees the contents without a glob round-trip; + // degrade to a glob hint if the tree can't be built (never block send). + let entries: ChatUploadArchiveEntry[] | null = null + try { + entries = await listChatUploadArchiveEntries(displayName, chatId) + } catch (treeErr) { + logger.warn('Failed to build archive upload tree', { + filename, + chatId, + error: toError(treeErr).message, + }) + } + if (entries && entries.length > 0) { + const shown = entries.slice(0, MAX_UPLOAD_TREE_ENTRIES) + const treeLines = shown.map((entry) => ` ${entry.path}`) + if (entries.length > MAX_UPLOAD_TREE_ENTRIES) { + treeLines.push(` … and ${entries.length - MAX_UPLOAD_TREE_ENTRIES} more`) + } + lines = [ + `Archive "${displayName}" (${mediaType}, ${f.size} bytes) uploaded — ${ + entries.length + } file${entries.length === 1 ? '' : 's'}:`, + ...treeLines, + '', + `List entries with: glob("uploads/${encodedUploadName}/*")`, + `Read an entry with: read("uploads/${encodedUploadName}/")`, + `To save the archive permanently: materialize_file(fileName: "${displayName}")`, + ] + } else { + lines = [ + `Archive "${displayName}" (${mediaType}, ${f.size} bytes) uploaded.`, + `List entries with: glob("uploads/${encodedUploadName}/*")`, + `Read an entry with: read("uploads/${encodedUploadName}/")`, + `To save the archive permanently: materialize_file(fileName: "${displayName}")`, + ] + } + } else { + lines = [ + `File "${displayName}" (${mediaType}, ${f.size} bytes) uploaded.`, + `Read with: read("uploads/${encodedUploadName}")`, + `To save permanently: materialize_file(fileName: "${displayName}")`, + ] + if (displayName.endsWith('.json')) { + lines.push( + `To import as a workflow: materialize_file(fileName: "${displayName}", operation: "import")` + ) + } } uploadContexts.push({ type: 'uploaded_file', diff --git a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts index 065b287b99e..4c4b14d9d6f 100644 --- a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts +++ b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts @@ -2,25 +2,46 @@ * @vitest-environment node */ +import { Buffer } from 'buffer' import { dbChainMock, dbChainMockFns, resetDbChainMock } from '@sim/testing' +import JSZip from 'jszip' import { beforeEach, describe, expect, it, vi } from 'vitest' vi.mock('@sim/db', () => dbChainMock) -const { mockReadFileRecord } = vi.hoisted(() => ({ - mockReadFileRecord: vi.fn(), -})) +const { mockReadFileRecord, mockRenderFileBuffer, mockFetchWorkspaceFileBuffer } = vi.hoisted( + () => ({ + mockReadFileRecord: vi.fn(), + // Echo the entry bytes back as text so a successful resolve is observable. + mockRenderFileBuffer: vi.fn(async (buffer: Buffer) => ({ + content: buffer.toString('utf-8'), + totalLines: 1, + })), + mockFetchWorkspaceFileBuffer: vi.fn(), + }) +) vi.mock('@/lib/copilot/vfs/file-reader', () => ({ readFileRecord: mockReadFileRecord, + renderFileBuffer: mockRenderFileBuffer, +})) +vi.mock('@/lib/uploads/contexts/workspace/workspace-file-manager', () => ({ + fetchWorkspaceFileBuffer: mockFetchWorkspaceFileBuffer, })) import { findMothershipUploadRowByChatAndName, + listChatUploadArchiveEntries, listChatUploads, - readChatUpload, + readChatUploadPath, } from './upload-file-reader' +async function buildZip(files: Record): Promise { + const zip = new JSZip() + for (const [name, content] of Object.entries(files)) zip.file(name, content) + return Buffer.from(await zip.generateAsync({ type: 'uint8array' })) +} + const CHAT_ID = '11111111-1111-1111-1111-111111111111' const NOW = new Date('2026-05-05T00:00:00.000Z') @@ -147,7 +168,7 @@ describe('listChatUploads', () => { }) }) -describe('readChatUpload', () => { +describe('readChatUploadPath (plain upload)', () => { beforeEach(() => { vi.clearAllMocks() resetDbChainMock() @@ -159,7 +180,7 @@ describe('readChatUpload', () => { mockOrderByThenLimit([row]) mockReadFileRecord.mockResolvedValueOnce({ content: 'PNGDATA', totalLines: 1 }) - const result = await readChatUpload('image (2).png', CHAT_ID) + const result = await readChatUploadPath('image (2).png', '', CHAT_ID) expect(result).toEqual({ content: 'PNGDATA', totalLines: 1 }) expect(mockReadFileRecord).toHaveBeenCalledWith( @@ -167,13 +188,89 @@ describe('readChatUpload', () => { ) }) + it('ignores a trailing habit suffix on a non-archive upload', async () => { + const row = makeRow({ id: 'wf_3', displayName: 'report.csv', contentType: 'text/csv' }) + mockOrderByThenLimit([row]) + mockReadFileRecord.mockResolvedValueOnce({ content: 'a,b', totalLines: 1 }) + + const result = await readChatUploadPath('report.csv', 'content', CHAT_ID) + + expect(result).toEqual({ content: 'a,b', totalLines: 1 }) + expect(mockReadFileRecord).toHaveBeenCalledWith(expect.objectContaining({ name: 'report.csv' })) + }) + it('returns null when no row matches', async () => { mockOrderByThenLimit([]) dbChainMockFns.orderBy.mockResolvedValueOnce([] as never) - const result = await readChatUpload('nope.png', CHAT_ID) + const result = await readChatUploadPath('nope.png', '', CHAT_ID) expect(result).toBeNull() expect(mockReadFileRecord).not.toHaveBeenCalled() }) }) + +describe('readChatUploadPath / listChatUploadArchiveEntries (archive)', () => { + beforeEach(() => { + vi.clearAllMocks() + resetDbChainMock() + }) + + it('lists archive entries as encoded VFS paths', async () => { + const buffer = await buildZip({ 'report.pdf': 'x', 'data/sheet.csv': 'a,b' }) + mockOrderByThenLimit([makeRow({ displayName: 'bundle.zip', contentType: 'application/zip' })]) + mockFetchWorkspaceFileBuffer.mockResolvedValueOnce(buffer) + + const entries = await listChatUploadArchiveEntries('bundle.zip', CHAT_ID) + + expect(entries?.map((e) => e.vfsPath).sort()).toEqual([ + 'uploads/bundle.zip/data/sheet.csv', + 'uploads/bundle.zip/report.pdf', + ]) + }) + + it('reads a nested entry by its exact path', async () => { + const buffer = await buildZip({ 'data/sheet.csv': 'a,b\n1,2' }) + mockOrderByThenLimit([makeRow({ displayName: 'bundle.zip', contentType: 'application/zip' })]) + mockFetchWorkspaceFileBuffer.mockResolvedValueOnce(buffer) + + const result = await readChatUploadPath('bundle.zip', 'data/sheet.csv', CHAT_ID) + + expect(result?.content).toBe('a,b\n1,2') + }) + + it('resolves a unicode (NFD) entry addressed by its NFC-encoded glob path', async () => { + // macOS-authored zip: entry name stored decomposed (e + combining acute). + const nfdName = `cafe\u0301.txt` // NFD: e + combining acute + const buffer = await buildZip({ [nfdName]: 'latte' }) + mockOrderByThenLimit([makeRow({ displayName: 'bundle.zip', contentType: 'application/zip' })]) + mockFetchWorkspaceFileBuffer.mockResolvedValueOnce(buffer) + + // The agent reads back the encoded path glob produced (NFC, percent-encoded). + const result = await readChatUploadPath('bundle.zip', 'caf%C3%A9.txt', CHAT_ID) + + expect(result?.content).toBe('latte') + }) + + it('returns null for an entry that is not in the archive', async () => { + const buffer = await buildZip({ 'present.txt': 'x' }) + mockOrderByThenLimit([makeRow({ displayName: 'bundle.zip', contentType: 'application/zip' })]) + mockFetchWorkspaceFileBuffer.mockResolvedValueOnce(buffer) + + const result = await readChatUploadPath('bundle.zip', 'missing.txt', CHAT_ID) + + expect(result).toBeNull() + }) + + it('returns the file-tree manifest for a bare archive read', async () => { + const buffer = await buildZip({ 'report.pdf': 'x', 'data/sheet.csv': 'a,b' }) + mockOrderByThenLimit([makeRow({ displayName: 'bundle.zip', contentType: 'application/zip' })]) + mockFetchWorkspaceFileBuffer.mockResolvedValueOnce(buffer) + + const result = await readChatUploadPath('bundle.zip', '', CHAT_ID) + + expect(result?.content).toContain('Archive "bundle.zip" — 2 files') + expect(result?.content).toContain('report.pdf') + expect(result?.content).toContain('data/sheet.csv') + }) +}) diff --git a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts index 0e914229c8a..6d919be7e08 100644 --- a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts +++ b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts @@ -3,7 +3,11 @@ import { workspaceFiles } from '@sim/db/schema' import { createLogger } from '@sim/logger' import { toError } from '@sim/utils/errors' import { and, asc, desc, eq, isNull, or } from 'drizzle-orm' -import { type FileReadResult, readFileRecord } from '@/lib/copilot/vfs/file-reader' +import { + type FileReadResult, + readFileRecord, + renderFileBuffer, +} from '@/lib/copilot/vfs/file-reader' import { type GrepCountEntry, type GrepMatch, @@ -13,7 +17,16 @@ import { } from '@/lib/copilot/vfs/operations' import { decodeVfsSegment, encodeVfsSegment } from '@/lib/copilot/vfs/path-utils' import { getServePathPrefix } from '@/lib/uploads' -import type { WorkspaceFileRecord } from '@/lib/uploads/contexts/workspace/workspace-file-manager' +import { ArchiveError, extractArchiveEntry, listArchiveEntries } from '@/lib/uploads/archive' +import { + fetchWorkspaceFileBuffer, + type WorkspaceFileRecord, +} from '@/lib/uploads/contexts/workspace/workspace-file-manager' +import { + getFileExtension, + getMimeTypeFromExtension, + isArchiveFileName, +} from '@/lib/uploads/utils/file-utils' const logger = createLogger('UploadFileReader') @@ -140,21 +153,187 @@ export async function listChatUploads(chatId: string): Promise { + try { + return decodeVfsSegment(segment) + } catch { + return segment + } + }) + .join('/') +} + +/** Re-encode a real `/`-joined entry path into its VFS-safe per-segment form. */ +function encodeEntryPath(path: string): string { + return path + .split('/') + .map((segment) => encodeVfsSegment(segment)) + .join('/') +} + +/** + * Canonical per-segment-encoded key for an archive entry path. Returns null for + * paths that cannot be encoded (empty/dot segments). + */ +function archiveEntryKey(path: string): string | null { + try { + return encodeEntryPath(path) + } catch { + return null + } +} + +/** + * Resolve a requested entry path (percent-encoded as the agent received it from + * glob, or the raw display form from the manifest) to the archive's exact stored + * path. Matching is on the canonical key so the NFC + whitespace normalization + * `encodeVfsSegment` applies stays symmetric between the listed paths and the + * read request — otherwise a macOS-authored (NFD / U+202F) entry name would list + * but never resolve. Returns null when nothing matches. + */ +async function findArchiveEntryRawPath( + archiveBuffer: Buffer, + requestedEntryPath: string +): Promise { + const wantedKey = archiveEntryKey(decodeEntryPath(requestedEntryPath)) + if (!wantedKey) return null + const entries = await listArchiveEntries(archiveBuffer) + return entries.find((entry) => archiveEntryKey(entry) === wantedKey) ?? null +} + +/** A single entry within an uploaded archive, with both its real and VFS paths. */ +export interface ChatUploadArchiveEntry { + /** Real sanitized path inside the archive (e.g. `data/sheet.csv`). */ + path: string + /** VFS path the agent uses to read it (e.g. `uploads/archive.zip/data/sheet.csv`). */ + vfsPath: string +} + +/** + * List the entries of an uploaded archive as VFS paths. Returns null when + * `zipName` is not an archive upload in this chat; returns `[]` when the archive + * is unreadable or empty (logged) so the caller still surfaces the archive leaf. + */ +export async function listChatUploadArchiveEntries( + zipName: string, + chatId: string +): Promise { + const row = await findMothershipUploadRowByChatAndName(chatId, zipName) + if (!row) return null + const record = toWorkspaceFileRecord(row) + if (!isArchiveUpload(record)) return null + + const encodedZip = canonicalUploadKey(record.name) + try { + const buffer = await fetchWorkspaceFileBuffer(record) + const entries = await listArchiveEntries(buffer) + return entries.map((path) => ({ + path, + vfsPath: `uploads/${encodedZip}/${encodeEntryPath(path)}`, + })) + } catch (err) { + logger.warn('Failed to list archive entries', { + zipName, + chatId, + error: toError(err).message, + }) + return [] + } +} + +/** + * Render one archive entry from the archive buffer with the same extraction + * logic as a stored upload. Returns null when the entry is missing; returns a + * placeholder result for cap violations. + */ +async function readArchiveEntry( + archiveBuffer: Buffer, + entryPath: string +): Promise { + const rawPath = await findArchiveEntryRawPath(archiveBuffer, entryPath) + if (!rawPath) return null + let entryBuffer: Buffer | null + try { + entryBuffer = await extractArchiveEntry(archiveBuffer, rawPath) + } catch (err) { + if (err instanceof ArchiveError) { + return { content: `[${err.message}]`, totalLines: 1 } + } + throw err + } + if (!entryBuffer) return null + const ext = getFileExtension(rawPath) + return renderFileBuffer(entryBuffer, { + name: rawPath, + type: getMimeTypeFromExtension(ext), + ext, + }) +} + +/** + * Build a file-tree manifest for a bare archive read (`read("uploads/x.zip")`), + * so the agent gets the contents instead of binary bytes. Returns a placeholder + * result when the archive is unreadable. */ -export async function readChatUpload( - filename: string, +async function buildArchiveManifest( + record: WorkspaceFileRecord, + archiveBuffer: Buffer +): Promise { + const encodedZip = canonicalUploadKey(record.name) + try { + const entries = await listArchiveEntries(archiveBuffer) + const header = `Archive "${record.name}" — ${entries.length} file${ + entries.length === 1 ? '' : 's' + }. Read an entry with read("uploads/${encodedZip}/").` + const content = [header, '', ...entries].join('\n') + return { content, totalLines: content.split('\n').length } + } catch (err) { + if (err instanceof ArchiveError) { + return { content: `[${err.message}]`, totalLines: 1 } + } + throw err + } +} + +/** + * Read a chat upload addressed by its first path segment and an optional entry + * path, resolving the upload row exactly once. A plain upload renders directly + * (a trailing habit suffix like `/content` is ignored); an archive returns the + * addressed entry, or its file-tree manifest when no entry is given. Resolves + * names like {@link findMothershipUploadRowByChatAndName} so visually equivalent + * spellings (e.g. macOS U+202F vs ASCII space) still match. + */ +export async function readChatUploadPath( + firstSegment: string, + entryPath: string, chatId: string ): Promise { try { - const row = await findMothershipUploadRowByChatAndName(chatId, filename) + const row = await findMothershipUploadRowByChatAndName(chatId, firstSegment) if (!row) return null - return readFileRecord(toWorkspaceFileRecord(row)) + const record = toWorkspaceFileRecord(row) + if (!isArchiveUpload(record)) { + return await readFileRecord(record) + } + const archiveBuffer = await fetchWorkspaceFileBuffer(record) + return entryPath + ? await readArchiveEntry(archiveBuffer, entryPath) + : await buildArchiveManifest(record, archiveBuffer) } catch (err) { logger.warn('Failed to read chat upload', { - filename, + firstSegment, + entryPath, chatId, error: toError(err).message, }) @@ -163,29 +342,62 @@ export async function readChatUpload( } /** - * Grep the content of a single chat upload (`uploads/`), mirroring - * {@link WorkspaceVFS.grepFile} for the chat-scoped uploads namespace. Resolves - * the upload by name (raw or percent-encoded), reads its text per file type, and - * greps it. Throws {@link WorkspaceFileGrepError} when the upload is missing or - * has no searchable text (image/binary/too-large) so the caller surfaces the - * message verbatim. + * Grep a chat upload addressed by its first path segment and an optional entry + * path, resolving the upload row exactly once and mirroring + * {@link WorkspaceVFS.grepFile} for the chat-scoped namespace. An archive entry + * is grepped from the archive; otherwise the upload itself is grepped (a trailing + * habit suffix on a non-archive is ignored). Throws {@link WorkspaceFileGrepError} + * when the upload/entry is missing or has no searchable text so the caller + * surfaces the message verbatim. */ -export async function grepChatUpload( - filename: string, +export async function grepChatUploadPath( + firstSegment: string, + entryPath: string, chatId: string, pattern: string, options?: GrepOptions ): Promise { - const row = await findMothershipUploadRowByChatAndName(chatId, filename) + const row = await findMothershipUploadRowByChatAndName(chatId, firstSegment) if (!row) { throw new WorkspaceFileGrepError( - `Upload not found: "${filename}". Use glob("uploads/*") to list available uploads.` + `Upload not found: "${firstSegment}". Use glob("uploads/*") to list available uploads.` ) } const record = toWorkspaceFileRecord(row) + + if (entryPath && isArchiveUpload(record)) { + const archiveBuffer = await fetchWorkspaceFileBuffer(record) + const rawPath = await findArchiveEntryRawPath(archiveBuffer, entryPath) + if (!rawPath) { + throw new WorkspaceFileGrepError( + `Archive entry not found: "${decodeEntryPath(entryPath)}" in "${record.name}".` + ) + } + let entryBuffer: Buffer | null + try { + entryBuffer = await extractArchiveEntry(archiveBuffer, rawPath) + } catch (err) { + if (err instanceof ArchiveError) { + throw new WorkspaceFileGrepError(err.message) + } + throw err + } + if (!entryBuffer) { + throw new WorkspaceFileGrepError(`Archive entry not found: "${rawPath}" in "${record.name}".`) + } + const ext = getFileExtension(rawPath) + const result = await renderFileBuffer(entryBuffer, { + name: rawPath, + type: getMimeTypeFromExtension(ext), + ext, + }) + const uploadsPath = `uploads/${canonicalUploadKey(record.name)}/${encodeEntryPath(rawPath)}` + return grepReadResult(uploadsPath, result, pattern, uploadsPath, options) + } + const result = await readFileRecord(record) if (!result) { - throw new WorkspaceFileGrepError(`Upload content not found for "${filename}".`) + throw new WorkspaceFileGrepError(`Upload content not found for "${firstSegment}".`) } const uploadsPath = `uploads/${canonicalUploadKey(record.name)}` return grepReadResult(uploadsPath, result, pattern, uploadsPath, options) diff --git a/apps/sim/lib/copilot/tools/handlers/vfs.test.ts b/apps/sim/lib/copilot/tools/handlers/vfs.test.ts index 72eea0cefb9..a028e2b0b2a 100644 --- a/apps/sim/lib/copilot/tools/handlers/vfs.test.ts +++ b/apps/sim/lib/copilot/tools/handlers/vfs.test.ts @@ -9,19 +9,24 @@ const { getOrMaterializeVFS } = vi.hoisted(() => ({ getOrMaterializeVFS: vi.fn(), })) -const { readChatUpload, listChatUploads, grepChatUpload } = vi.hoisted(() => ({ - readChatUpload: vi.fn(), - listChatUploads: vi.fn(), - grepChatUpload: vi.fn(), -})) +const { readChatUploadPath, listChatUploads, grepChatUploadPath, listChatUploadArchiveEntries } = + vi.hoisted(() => ({ + readChatUploadPath: vi.fn(), + listChatUploads: vi.fn(), + grepChatUploadPath: vi.fn(), + // Defaults to null (not an archive) so archive glob expansion is a no-op + // unless a test opts in. + listChatUploadArchiveEntries: vi.fn().mockResolvedValue(null), + })) vi.mock('@/lib/copilot/vfs', () => ({ getOrMaterializeVFS, })) vi.mock('./upload-file-reader', () => ({ - readChatUpload, + readChatUploadPath, listChatUploads, - grepChatUpload, + grepChatUploadPath, + listChatUploadArchiveEntries, })) import { WorkspaceFileGrepError } from '@/lib/copilot/vfs/operations' @@ -305,7 +310,7 @@ describe('vfs uploads are opt-in (like recently-deleted/)', () => { await executeVfsGrep({ pattern: 'secret' }, GREP_CTX_CHAT) - expect(grepChatUpload).not.toHaveBeenCalled() + expect(grepChatUploadPath).not.toHaveBeenCalled() expect(vfs.grep).toHaveBeenCalledWith('secret', undefined, expect.any(Object)) }) @@ -316,11 +321,11 @@ describe('vfs uploads are opt-in (like recently-deleted/)', () => { await executeVfsGrep({ pattern: 'secret', path: 'files/report.csv' }, GREP_CTX_CHAT) - expect(grepChatUpload).not.toHaveBeenCalled() + expect(grepChatUploadPath).not.toHaveBeenCalled() }) - it('routes an explicit uploads/ path to grepChatUpload', async () => { - grepChatUpload.mockResolvedValue([{ path: 'uploads/report.json', line: 1, content: 'hit' }]) + it('routes an explicit uploads/ path to grepChatUploadPath', async () => { + grepChatUploadPath.mockResolvedValue([{ path: 'uploads/report.json', line: 1, content: 'hit' }]) const result = await executeVfsGrep( { pattern: 'hit', path: 'uploads/report.json' }, @@ -328,8 +333,9 @@ describe('vfs uploads are opt-in (like recently-deleted/)', () => { ) expect(result.success).toBe(true) - expect(grepChatUpload).toHaveBeenCalledWith( + expect(grepChatUploadPath).toHaveBeenCalledWith( 'report.json', + '', 'chat-1', 'hit', expect.objectContaining({ maxResults: 50 }) @@ -342,7 +348,7 @@ describe('vfs uploads are opt-in (like recently-deleted/)', () => { expect(result.success).toBe(false) expect(result.error).toContain('single upload') - expect(grepChatUpload).not.toHaveBeenCalled() + expect(grepChatUploadPath).not.toHaveBeenCalled() }) it('errors when grepping uploads without chat context', async () => { @@ -350,11 +356,11 @@ describe('vfs uploads are opt-in (like recently-deleted/)', () => { expect(result.success).toBe(false) expect(result.error).toContain('No chat context') - expect(grepChatUpload).not.toHaveBeenCalled() + expect(grepChatUploadPath).not.toHaveBeenCalled() }) it('surfaces an upload-not-found grep error verbatim', async () => { - grepChatUpload.mockRejectedValue( + grepChatUploadPath.mockRejectedValue( new WorkspaceFileGrepError( 'Upload not found: "ghost.json". Use glob("uploads/*") to list available uploads.' ) @@ -382,26 +388,115 @@ describe('vfs uploads are opt-in (like recently-deleted/)', () => { expect((broad.output as { files: string[] }).files).not.toContain('uploads/My%20Report.json') }) - it('reads an upload directly, tolerating a spurious /content suffix', async () => { + it('reads an upload directly, passing the first segment and any trailing suffix', async () => { const vfs = makeVfs() getOrMaterializeVFS.mockResolvedValue(vfs) - readChatUpload.mockResolvedValue({ content: 'hello upload', totalLines: 1 }) + readChatUploadPath.mockResolvedValue({ content: 'hello upload', totalLines: 1 }) const bare = await executeVfsRead({ path: 'uploads/report.csv' }, GREP_CTX_CHAT) expect(bare.success).toBe(true) - expect(readChatUpload).toHaveBeenLastCalledWith('report.csv', 'chat-1') + expect(readChatUploadPath).toHaveBeenLastCalledWith('report.csv', '', 'chat-1') - // The model adds /content out of habit (from files/) — it must still resolve. + // The model adds /content out of habit (from files/); the trailing segment is + // forwarded and ignored by readChatUploadPath for a non-archive upload. const withContent = await executeVfsRead({ path: 'uploads/report.csv/content' }, GREP_CTX_CHAT) expect(withContent.success).toBe(true) - expect(readChatUpload).toHaveBeenLastCalledWith('report.csv', 'chat-1') + expect(readChatUploadPath).toHaveBeenLastCalledWith('report.csv', 'content', 'chat-1') }) - it('tolerates a trailing /content on an uploads grep path', async () => { - grepChatUpload.mockResolvedValue([]) + it('forwards a trailing segment on an uploads grep path', async () => { + grepChatUploadPath.mockResolvedValue([]) await executeVfsGrep({ pattern: 'x', path: 'uploads/report.json/content' }, GREP_CTX_CHAT) - expect(grepChatUpload).toHaveBeenCalledWith('report.json', 'chat-1', 'x', expect.any(Object)) + expect(grepChatUploadPath).toHaveBeenCalledWith( + 'report.json', + 'content', + 'chat-1', + 'x', + expect.any(Object) + ) + }) +}) + +describe('vfs archive uploads (virtual folders)', () => { + beforeEach(() => { + vi.clearAllMocks() + }) + + it('expands a specific archive glob into its entry paths', async () => { + const vfs = makeVfs() + getOrMaterializeVFS.mockResolvedValue(vfs) + listChatUploads.mockResolvedValue([{ name: 'bundle.zip' }]) + listChatUploadArchiveEntries.mockResolvedValue([ + { path: 'report.pdf', vfsPath: 'uploads/bundle.zip/report.pdf' }, + { path: 'data/sheet.csv', vfsPath: 'uploads/bundle.zip/data/sheet.csv' }, + ]) + + const result = await executeVfsGlob({ pattern: 'uploads/bundle.zip/*' }, GREP_CTX_CHAT) + + expect(listChatUploadArchiveEntries).toHaveBeenCalledWith('bundle.zip', 'chat-1') + expect((result.output as { files: string[] }).files).toEqual( + expect.arrayContaining([ + 'uploads/bundle.zip', + 'uploads/bundle.zip/report.pdf', + 'uploads/bundle.zip/data/sheet.csv', + ]) + ) + }) + + it('does not expand archives for the broad uploads/* glob', async () => { + const vfs = makeVfs() + getOrMaterializeVFS.mockResolvedValue(vfs) + listChatUploads.mockResolvedValue([{ name: 'bundle.zip' }]) + + await executeVfsGlob({ pattern: 'uploads/*' }, GREP_CTX_CHAT) + + expect(listChatUploadArchiveEntries).not.toHaveBeenCalled() + }) + + it('forwards a nested archive entry read to readChatUploadPath', async () => { + readChatUploadPath.mockResolvedValue({ content: 'a,b\n1,2', totalLines: 2 }) + + const result = await executeVfsRead( + { path: 'uploads/bundle.zip/data/sheet.csv' }, + GREP_CTX_CHAT + ) + + expect(result.success).toBe(true) + expect(readChatUploadPath).toHaveBeenCalledWith('bundle.zip', 'data/sheet.csv', 'chat-1') + }) + + it('forwards a bare archive read to readChatUploadPath with no entry', async () => { + readChatUploadPath.mockResolvedValue({ + content: 'Archive "bundle.zip" — 2 files:\nreport.pdf\ndata/sheet.csv', + totalLines: 3, + }) + + const result = await executeVfsRead({ path: 'uploads/bundle.zip' }, GREP_CTX_CHAT) + + expect(result.success).toBe(true) + expect(readChatUploadPath).toHaveBeenCalledWith('bundle.zip', '', 'chat-1') + expect((result.output as { content: string }).content).toContain('Archive "bundle.zip"') + }) + + it('forwards a nested archive entry grep to grepChatUploadPath', async () => { + grepChatUploadPath.mockResolvedValue([ + { path: 'uploads/bundle.zip/notes.txt', line: 1, content: 'hit' }, + ]) + + const result = await executeVfsGrep( + { pattern: 'hit', path: 'uploads/bundle.zip/notes.txt' }, + GREP_CTX_CHAT + ) + + expect(result.success).toBe(true) + expect(grepChatUploadPath).toHaveBeenCalledWith( + 'bundle.zip', + 'notes.txt', + 'chat-1', + 'hit', + expect.any(Object) + ) }) }) diff --git a/apps/sim/lib/copilot/tools/handlers/vfs.ts b/apps/sim/lib/copilot/tools/handlers/vfs.ts index ca1902692e1..71d5d86c3c7 100644 --- a/apps/sim/lib/copilot/tools/handlers/vfs.ts +++ b/apps/sim/lib/copilot/tools/handlers/vfs.ts @@ -6,7 +6,12 @@ import { getOrMaterializeVFS } from '@/lib/copilot/vfs' import type { GrepCountEntry, GrepMatch } from '@/lib/copilot/vfs/operations' import { WorkspaceFileGrepError } from '@/lib/copilot/vfs/operations' import { encodeVfsSegment } from '@/lib/copilot/vfs/path-utils' -import { grepChatUpload, listChatUploads, readChatUpload } from './upload-file-reader' +import { + grepChatUploadPath, + listChatUploadArchiveEntries, + listChatUploads, + readChatUploadPath, +} from './upload-file-reader' const logger = createLogger('VfsTools') @@ -40,6 +45,21 @@ function isChatUploadGrepPath(path: string | undefined): path is string { return /^uploads(\/|$)/.test(path.replace(/^\/+/, '')) } +/** + * Extract the concrete archive segment a glob reaches into, e.g. `archive.zip` + * from `uploads/archive.zip/*`. Returns null for the broad `uploads/*` listing + * or when the first segment is itself a glob, so archives stay single leaves + * until the model globs inside one specifically. + */ +function parseArchiveGlobSegment(pattern: string): string | null { + const rest = pattern.replace(/^\/+/, '').replace(/^uploads\//, '') + const firstSlash = rest.indexOf('/') + if (firstSlash === -1) return null + const segment = rest.slice(0, firstSlash) + if (!segment || /[*?[\]{}]/.test(segment)) return null + return segment +} + function serializedResultSize(value: unknown): number { try { return JSON.stringify(value).length @@ -104,20 +124,29 @@ export async function executeVfsGrep( if (!context.chatId) { return { success: false, error: 'No chat context available for uploads/' } } - // The upload is the first segment after uploads/; any trailing segment - // (e.g. a /content suffix) is ignored, mirroring the uploads read path. - const filename = rawPath + // The upload is the first segment after uploads/. A further segment is + // either an archive entry (uploads//) or a habit suffix + // (e.g. a /content suffix), both handled by grepChatUploadPath. + const uploadSegments = rawPath .replace(/^\/+/, '') .replace(/^uploads\/?/, '') - .split('/')[0] - if (!filename) { + .split('/') + const firstSegment = uploadSegments[0] + const entryPath = uploadSegments.slice(1).join('/') + if (!firstSegment) { return { success: false, error: 'Grep over chat uploads must target a single upload (e.g. path: "uploads/report.json"). Use glob("uploads/*") to list uploads.', } } - result = await grepChatUpload(filename, context.chatId, pattern, grepOptions) + result = await grepChatUploadPath( + firstSegment, + entryPath, + context.chatId, + pattern, + grepOptions + ) } else { const vfs = await getOrMaterializeVFS(workspaceId, context.userId) result = isWorkspaceFileGrepPath(rawPath) @@ -185,6 +214,16 @@ export async function executeVfsGlob( // upload resolver accepts both the encoded path and the raw display name. const uploadPaths = uploads.map((f) => `uploads/${encodeUploadSegment(f.name)}`) files = [...files, ...uploadPaths] + + // Expand a specific archive's entries when the glob reaches inside it + // (uploads//*). Broad uploads/* keeps archives as single leaves. + const archiveSegment = parseArchiveGlobSegment(pattern) + if (archiveSegment) { + const entries = await listChatUploadArchiveEntries(archiveSegment, context.chatId) + if (entries) { + files = [...files, ...entries.map((entry) => entry.vfsPath)] + } + } } logger.debug('vfs_glob result', { pattern, fileCount: files.length }) @@ -243,8 +282,13 @@ export async function executeVfsRead( if (!context.chatId) { return { success: false, error: 'No chat context available for uploads/' } } - const filename = path.slice('uploads/'.length).split('/')[0] - const uploadResult = await readChatUpload(filename, context.chatId) + // The upload is the first segment after uploads/. A further segment is + // either an archive entry (uploads//) or a habit suffix + // (e.g. a /content suffix), both handled by readChatUploadPath. + const uploadSegments = path.slice('uploads/'.length).split('/') + const firstSegment = uploadSegments[0] + const entryPath = uploadSegments.slice(1).join('/') + const uploadResult = await readChatUploadPath(firstSegment, entryPath, context.chatId) if (uploadResult) { const isAttachment = hasModelAttachment(uploadResult) if ( diff --git a/apps/sim/lib/copilot/vfs/file-reader.test.ts b/apps/sim/lib/copilot/vfs/file-reader.test.ts index f4326b32035..7674e9d7fc9 100644 --- a/apps/sim/lib/copilot/vfs/file-reader.test.ts +++ b/apps/sim/lib/copilot/vfs/file-reader.test.ts @@ -15,7 +15,7 @@ vi.mock('@/lib/uploads/contexts/workspace/workspace-file-manager', () => ({ fetchWorkspaceFileBuffer, })) -import { readFileRecord } from '@/lib/copilot/vfs/file-reader' +import { readFileRecord, renderFileBuffer } from '@/lib/copilot/vfs/file-reader' const MAX_IMAGE_READ_BYTES = 5 * 1024 * 1024 @@ -62,4 +62,74 @@ describe('readFileRecord', () => { }, SHARP_TEST_TIMEOUT_MS ) + + it('returns the binary placeholder for an unrenderable type WITHOUT downloading', async () => { + fetchWorkspaceFileBuffer.mockClear() + const result = await readFileRecord({ + id: 'wf_bin', + workspaceId: 'ws_1', + name: 'archive.bin', + key: 'uploads/archive.bin', + path: '/api/files/serve/uploads%2Farchive.bin?context=mothership', + size: 4_000_000_000, // 4 GB — must never be fetched into memory + type: 'application/octet-stream', + uploadedBy: 'user_1', + uploadedAt: new Date(), + deletedAt: null, + storageContext: 'mothership', + }) + + expect(result?.content).toContain('[Binary file: archive.bin') + expect(fetchWorkspaceFileBuffer).not.toHaveBeenCalled() + }) +}) + +describe('renderFileBuffer', () => { + it('renders readable text content verbatim with line counts', async () => { + const buffer = Buffer.from('line one\nline two\nline three') + const result = await renderFileBuffer(buffer, { + name: 'notes.txt', + type: 'text/plain', + ext: 'txt', + }) + expect(result.content).toBe('line one\nline two\nline three') + expect(result.totalLines).toBe(3) + expect(result.attachment).toBeUndefined() + }) + + it('renders csv and json by content type', async () => { + const csv = await renderFileBuffer(Buffer.from('a,b\n1,2'), { + name: 'data.csv', + type: 'text/csv', + ext: 'csv', + }) + expect(csv.content).toBe('a,b\n1,2') + + const json = await renderFileBuffer(Buffer.from('{"k":1}'), { + name: 'config.json', + type: 'application/json', + ext: 'json', + }) + expect(json.content).toBe('{"k":1}') + }) + + it('returns a binary placeholder for unrenderable types', async () => { + const result = await renderFileBuffer(Buffer.from([0x00, 0x01, 0x02, 0x03]), { + name: 'blob.dat', + type: 'application/octet-stream', + ext: 'dat', + }) + expect(result.content).toContain('[Binary file: blob.dat') + expect(result.attachment).toBeUndefined() + }) + + it('rejects oversized text without returning content', async () => { + const big = Buffer.alloc(MAX_IMAGE_READ_BYTES + 1, 0x61) // > 5MB of 'a' + const result = await renderFileBuffer(big, { + name: 'huge.txt', + type: 'text/plain', + ext: 'txt', + }) + expect(result.content).toContain('[File too large to display inline: huge.txt') + }) }) diff --git a/apps/sim/lib/copilot/vfs/file-reader.ts b/apps/sim/lib/copilot/vfs/file-reader.ts index 26388d5621a..9d3d112c418 100644 --- a/apps/sim/lib/copilot/vfs/file-reader.ts +++ b/apps/sim/lib/copilot/vfs/file-reader.ts @@ -274,6 +274,141 @@ export interface FileReadResult { } } +/** Placeholder returned when a text file exceeds the inline read budget. */ +function textTooLargeResult(name: string, size: number): FileReadResult { + return { + content: `[File too large to display inline: ${name} (${size} bytes, limit ${MAX_TEXT_READ_BYTES})]`, + totalLines: 1, + } +} + +/** Placeholder returned when a parseable document exceeds the inline parse budget. */ +function documentTooLargeResult(name: string, size: number): FileReadResult { + return { + content: `[Document too large to parse inline: ${name} (${size} bytes, limit ${MAX_PARSEABLE_READ_BYTES})]`, + totalLines: 1, + } +} + +/** Placeholder returned for a file whose bytes cannot be rendered as text. */ +function binaryPlaceholderResult(name: string, type: string, size: number): FileReadResult { + return { + content: `[Binary file: ${name} (${type}, ${size} bytes). Cannot display as text.]`, + totalLines: 1, + } +} + +/** True when a file is binary — not an image, not text, and not a parseable document. */ +function isBinaryFile(type: string, ext: string): boolean { + return !isImageFileType(type) && !isReadableType(type) && !PARSEABLE_EXTENSIONS.has(ext) +} + +/** + * Render an in-memory file buffer into a {@link FileReadResult} using the same + * image / text / parseable-document / binary logic as a stored upload. + * + * Pure aside from the optional `span`, which only carries read-path/outcome + * telemetry when called from {@link readFileRecord}; archive-entry reads pass no + * span. Size caps apply to the buffer length, so an inflated zip entry is bounded + * exactly like a stored file. + */ +export async function renderFileBuffer( + buffer: Buffer, + meta: { name: string; type: string; ext: string }, + span?: Span +): Promise { + const { name, type, ext } = meta + + if (isImageFileType(type)) { + span?.setAttribute(TraceAttr.CopilotVfsReadPath, CopilotVfsReadPath.Image) + const prepared = await prepareImageForVision(buffer, type) + if (!prepared) { + span?.setAttribute(TraceAttr.CopilotVfsReadOutcome, CopilotVfsReadOutcome.ImageTooLarge) + return { + content: `[Image too large: ${name} (${(buffer.length / 1024 / 1024).toFixed(1)}MB, limit 5MB after resize/compression)]`, + totalLines: 1, + } + } + const sizeKb = (prepared.buffer.length / 1024).toFixed(1) + const resizeNote = prepared.resized ? ', resized for vision' : '' + span?.setAttributes({ + [TraceAttr.CopilotVfsReadOutcome]: CopilotVfsReadOutcome.ImagePrepared, + [TraceAttr.CopilotVfsReadOutputBytes]: prepared.buffer.length, + [TraceAttr.CopilotVfsReadOutputMediaType]: prepared.mediaType, + [TraceAttr.CopilotVfsReadImageResized]: prepared.resized, + }) + return { + content: `Image: ${name} (${sizeKb}KB, ${prepared.mediaType}${resizeNote})`, + totalLines: 1, + attachment: { + type: 'image', + name, + source: { + type: 'base64' as const, + media_type: prepared.mediaType, + data: prepared.buffer.toString('base64'), + }, + }, + } + } + + if (isReadableType(type)) { + span?.setAttribute(TraceAttr.CopilotVfsReadPath, CopilotVfsReadPath.Text) + if (buffer.length > MAX_TEXT_READ_BYTES) { + span?.setAttribute(TraceAttr.CopilotVfsReadOutcome, CopilotVfsReadOutcome.TextTooLarge) + return textTooLargeResult(name, buffer.length) + } + const content = buffer.toString('utf-8') + const lines = content.split('\n').length + span?.setAttributes({ + [TraceAttr.CopilotVfsReadOutcome]: CopilotVfsReadOutcome.TextRead, + [TraceAttr.CopilotVfsReadOutputBytes]: buffer.length, + [TraceAttr.CopilotVfsReadOutputLines]: lines, + }) + return { content, totalLines: lines } + } + + if (PARSEABLE_EXTENSIONS.has(ext)) { + span?.setAttribute(TraceAttr.CopilotVfsReadPath, CopilotVfsReadPath.ParseableDocument) + if (buffer.length > MAX_PARSEABLE_READ_BYTES) { + span?.setAttribute(TraceAttr.CopilotVfsReadOutcome, CopilotVfsReadOutcome.DocumentTooLarge) + return documentTooLargeResult(name, buffer.length) + } + try { + const { parseBuffer } = await import('@/lib/file-parsers') + const result = await parseBuffer(buffer, ext) + const content = result.content || '' + const lines = content.split('\n').length + span?.setAttributes({ + [TraceAttr.CopilotVfsReadOutcome]: CopilotVfsReadOutcome.DocumentParsed, + [TraceAttr.CopilotVfsReadOutputBytes]: content.length, + [TraceAttr.CopilotVfsReadOutputLines]: lines, + }) + return { content, totalLines: lines } + } catch (parseErr) { + logger.warn('Failed to parse document', { + fileName: name, + ext, + error: toError(parseErr).message, + }) + span?.addEvent(TraceEvent.CopilotVfsParseFailed, { + [TraceAttr.ErrorMessage]: toError(parseErr).message.slice(0, 500), + }) + span?.setAttribute(TraceAttr.CopilotVfsReadOutcome, CopilotVfsReadOutcome.ParseFailed) + return { + content: `[Could not parse ${name} (${type}, ${buffer.length} bytes)]`, + totalLines: 1, + } + } + } + + span?.setAttributes({ + [TraceAttr.CopilotVfsReadPath]: CopilotVfsReadPath.Binary, + [TraceAttr.CopilotVfsReadOutcome]: CopilotVfsReadOutcome.BinaryPlaceholder, + }) + return binaryPlaceholderResult(name, type, buffer.length) +} + /** * Read and return the content of a workspace file record. * Handles images (base64 attachment), parseable documents (PDF, DOCX, etc.), @@ -298,111 +433,35 @@ export async function readFileRecord(record: WorkspaceFileRecord): Promise { try { - if (isImageFileType(record.type)) { - span.setAttribute(TraceAttr.CopilotVfsReadPath, CopilotVfsReadPath.Image) - const originalBuffer = await fetchWorkspaceFileBuffer(record) - const prepared = await prepareImageForVision(originalBuffer, record.type) - if (!prepared) { - span.setAttribute(TraceAttr.CopilotVfsReadOutcome, CopilotVfsReadOutcome.ImageTooLarge) - return { - content: `[Image too large: ${record.name} (${(record.size / 1024 / 1024).toFixed(1)}MB, limit 5MB after resize/compression)]`, - totalLines: 1, - } - } - const sizeKb = (prepared.buffer.length / 1024).toFixed(1) - const resizeNote = prepared.resized ? ', resized for vision' : '' - span.setAttributes({ - [TraceAttr.CopilotVfsReadOutcome]: CopilotVfsReadOutcome.ImagePrepared, - [TraceAttr.CopilotVfsReadOutputBytes]: prepared.buffer.length, - [TraceAttr.CopilotVfsReadOutputMediaType]: prepared.mediaType, - [TraceAttr.CopilotVfsReadImageResized]: prepared.resized, - }) - return { - content: `Image: ${record.name} (${sizeKb}KB, ${prepared.mediaType}${resizeNote})`, - totalLines: 1, - attachment: { - type: 'image', - name: record.name, - source: { - type: 'base64' as const, - media_type: prepared.mediaType, - data: prepared.buffer.toString('base64'), - }, - }, - } - } - - if (isReadableType(record.type)) { + const ext = getExtension(record.name) + // Pre-fetch size guards: reject oversized text/parseable files without + // paying for the download. Images are always fetched (to sniff + resize). + if (isReadableType(record.type) && record.size > MAX_TEXT_READ_BYTES) { span.setAttribute(TraceAttr.CopilotVfsReadPath, CopilotVfsReadPath.Text) - if (record.size > MAX_TEXT_READ_BYTES) { - span.setAttribute(TraceAttr.CopilotVfsReadOutcome, CopilotVfsReadOutcome.TextTooLarge) - return { - content: `[File too large to display inline: ${record.name} (${record.size} bytes, limit ${MAX_TEXT_READ_BYTES})]`, - totalLines: 1, - } - } - - const buffer = await fetchWorkspaceFileBuffer(record) - const content = buffer.toString('utf-8') - const lines = content.split('\n').length - span.setAttributes({ - [TraceAttr.CopilotVfsReadOutcome]: CopilotVfsReadOutcome.TextRead, - [TraceAttr.CopilotVfsReadOutputBytes]: buffer.length, - [TraceAttr.CopilotVfsReadOutputLines]: lines, - }) - return { content, totalLines: lines } + span.setAttribute(TraceAttr.CopilotVfsReadOutcome, CopilotVfsReadOutcome.TextTooLarge) + return textTooLargeResult(record.name, record.size) } - - const ext = getExtension(record.name) - if (PARSEABLE_EXTENSIONS.has(ext)) { + if ( + !isImageFileType(record.type) && + !isReadableType(record.type) && + PARSEABLE_EXTENSIONS.has(ext) && + record.size > MAX_PARSEABLE_READ_BYTES + ) { span.setAttribute(TraceAttr.CopilotVfsReadPath, CopilotVfsReadPath.ParseableDocument) - if (record.size > MAX_PARSEABLE_READ_BYTES) { - span.setAttribute( - TraceAttr.CopilotVfsReadOutcome, - CopilotVfsReadOutcome.DocumentTooLarge - ) - return { - content: `[Document too large to parse inline: ${record.name} (${record.size} bytes, limit ${MAX_PARSEABLE_READ_BYTES})]`, - totalLines: 1, - } - } - const buffer = await fetchWorkspaceFileBuffer(record) - try { - const { parseBuffer } = await import('@/lib/file-parsers') - const result = await parseBuffer(buffer, ext) - const content = result.content || '' - const lines = content.split('\n').length - span.setAttributes({ - [TraceAttr.CopilotVfsReadOutcome]: CopilotVfsReadOutcome.DocumentParsed, - [TraceAttr.CopilotVfsReadOutputBytes]: content.length, - [TraceAttr.CopilotVfsReadOutputLines]: lines, - }) - return { content, totalLines: lines } - } catch (parseErr) { - logger.warn('Failed to parse document', { - fileName: record.name, - ext, - error: toError(parseErr).message, - }) - span.addEvent(TraceEvent.CopilotVfsParseFailed, { - [TraceAttr.ErrorMessage]: toError(parseErr).message.slice(0, 500), - }) - span.setAttribute(TraceAttr.CopilotVfsReadOutcome, CopilotVfsReadOutcome.ParseFailed) - return { - content: `[Could not parse ${record.name} (${record.type}, ${record.size} bytes)]`, - totalLines: 1, - } - } + span.setAttribute(TraceAttr.CopilotVfsReadOutcome, CopilotVfsReadOutcome.DocumentTooLarge) + return documentTooLargeResult(record.name, record.size) } - - span.setAttributes({ - [TraceAttr.CopilotVfsReadPath]: CopilotVfsReadPath.Binary, - [TraceAttr.CopilotVfsReadOutcome]: CopilotVfsReadOutcome.BinaryPlaceholder, - }) - return { - content: `[Binary file: ${record.name} (${record.type}, ${record.size} bytes). Cannot display as text.]`, - totalLines: 1, + // Binary/unknown types never need the bytes — return the placeholder + // without paying for a download (workspace files can be multi-GB). + if (isBinaryFile(record.type, ext)) { + span.setAttributes({ + [TraceAttr.CopilotVfsReadPath]: CopilotVfsReadPath.Binary, + [TraceAttr.CopilotVfsReadOutcome]: CopilotVfsReadOutcome.BinaryPlaceholder, + }) + return binaryPlaceholderResult(record.name, record.type, record.size) } + const buffer = await fetchWorkspaceFileBuffer(record) + return await renderFileBuffer(buffer, { name: record.name, type: record.type, ext }, span) } catch (err) { logger.warn('Failed to read workspace file', { fileName: record.name, diff --git a/apps/sim/lib/uploads/archive.test.ts b/apps/sim/lib/uploads/archive.test.ts new file mode 100644 index 00000000000..a3acc55d97b --- /dev/null +++ b/apps/sim/lib/uploads/archive.test.ts @@ -0,0 +1,123 @@ +/** + * @vitest-environment node + */ +import { Buffer } from 'buffer' +import JSZip from 'jszip' +import { describe, expect, it } from 'vitest' +import { + ArchiveError, + extractArchiveEntry, + listArchiveEntries, + MAX_ARCHIVE_ENTRIES, +} from '@/lib/uploads/archive' + +async function buildZip(files: Record): Promise { + const zip = new JSZip() + for (const [name, content] of Object.entries(files)) { + zip.file(name, content) + } + const arr = await zip.generateAsync({ type: 'uint8array' }) + return Buffer.from(arr) +} + +describe('listArchiveEntries', () => { + it('enumerates nested entries with sanitized joined paths', async () => { + const buffer = await buildZip({ + 'report.txt': 'hello', + 'data/sheet.csv': 'a,b\n1,2', + 'data/nested/deep.json': '{}', + }) + + const paths = (await listArchiveEntries(buffer)).sort() + + expect(paths).toEqual(['data/nested/deep.json', 'data/sheet.csv', 'report.txt']) + }) + + it('skips directory entries', async () => { + const zip = new JSZip() + zip.folder('emptydir') + zip.file('file.txt', 'x') + const buffer = Buffer.from(await zip.generateAsync({ type: 'uint8array' })) + + expect(await listArchiveEntries(buffer)).toEqual(['file.txt']) + }) + + it('never surfaces a path with a traversal segment or absolute root', async () => { + // JSZip itself strips leading `../`, keeping a contained basename; our guard + // additionally rejects any residual `..` (e.g. a Windows-style backslash path + // that JSZip stores verbatim) so nothing can escape the archive root. + const buffer = await buildZip({ + 'safe.txt': 'ok', + '..\\evil.txt': 'evil', + 'sub\\..\\..\\evil2.txt': 'evil', + }) + + const paths = await listArchiveEntries(buffer) + + expect(paths).toContain('safe.txt') + expect(paths.some((p) => p.split('/').includes('..'))).toBe(false) + expect(paths.some((p) => p.startsWith('/'))).toBe(false) + expect(paths).not.toContain('evil.txt') + expect(paths).not.toContain('evil2.txt') + }) + + it('filters __MACOSX, .DS_Store and Thumbs.db noise', async () => { + const buffer = await buildZip({ + 'doc.txt': 'real', + '__MACOSX/._doc.txt': 'junk', + '.DS_Store': 'junk', + 'sub/.DS_Store': 'junk', + 'sub/Thumbs.db': 'junk', + }) + + expect(await listArchiveEntries(buffer)).toEqual(['doc.txt']) + }) + + it('rejects archives with too many entries', async () => { + const files: Record = {} + for (let i = 0; i <= MAX_ARCHIVE_ENTRIES; i++) { + files[`f${i}.txt`] = 'x' + } + const buffer = await buildZip(files) + + await expect(listArchiveEntries(buffer)).rejects.toMatchObject({ + name: 'ArchiveError', + reason: 'too_many_entries', + }) + }) + + it('throws ArchiveError invalid for non-zip buffers', async () => { + await expect(listArchiveEntries(Buffer.from('not a zip at all'))).rejects.toBeInstanceOf( + ArchiveError + ) + }) +}) + +describe('extractArchiveEntry', () => { + it('extracts a single entry by sanitized path', async () => { + const buffer = await buildZip({ + 'report.txt': 'the body', + 'data/sheet.csv': 'a,b\n1,2', + }) + + const csv = await extractArchiveEntry(buffer, 'data/sheet.csv') + expect(csv?.toString('utf-8')).toBe('a,b\n1,2') + + const txt = await extractArchiveEntry(buffer, 'report.txt') + expect(txt?.toString('utf-8')).toBe('the body') + }) + + it('returns null when the entry does not exist', async () => { + const buffer = await buildZip({ 'report.txt': 'x' }) + expect(await extractArchiveEntry(buffer, 'missing.txt')).toBeNull() + }) + + it('does not resolve traversal paths', async () => { + const buffer = await buildZip({ '..\\evil.txt': 'evil', 'safe.txt': 'ok' }) + // The traversal entry sanitizes to null, so it is unmatchable by any path. + expect(await extractArchiveEntry(buffer, '../evil.txt')).toBeNull() + expect(await extractArchiveEntry(buffer, '..\\evil.txt')).toBeNull() + expect(await extractArchiveEntry(buffer, 'evil.txt')).toBeNull() + expect((await extractArchiveEntry(buffer, 'safe.txt'))?.toString('utf-8')).toBe('ok') + }) +}) diff --git a/apps/sim/lib/uploads/archive.ts b/apps/sim/lib/uploads/archive.ts new file mode 100644 index 00000000000..590e63c7c9a --- /dev/null +++ b/apps/sim/lib/uploads/archive.ts @@ -0,0 +1,209 @@ +import { Buffer } from 'buffer' +import type { Readable } from 'stream' +import JSZip from 'jszip' + +/** + * Shared, zip-bomb / zip-slip safe archive primitives. + * + * These were originally inlined in the file-manage decompress route; they are + * factored here so the copilot VFS can present an uploaded `.zip` as a virtual + * folder (list entries, extract one entry on read) using the exact same safety + * guarantees. The declared sizes in a ZIP header are attacker-controlled, so the + * real caps are always enforced on the inflated byte stream — never on metadata. + */ + +/** Input archive download/size cap. */ +export const MAX_ARCHIVE_BYTES = 100 * 1024 * 1024 +/** Maximum number of entries enumerated/extracted from a single archive. */ +export const MAX_ARCHIVE_ENTRIES = 1000 +/** Maximum uncompressed size for any single archive entry. */ +export const MAX_ARCHIVE_ENTRY_BYTES = 100 * 1024 * 1024 +/** Maximum total uncompressed size across all entries, to bound zip-bomb expansion. */ +export const MAX_ARCHIVE_TOTAL_BYTES = 200 * 1024 * 1024 + +const S_IFMT = 0o170000 +const S_IFLNK = 0o120000 + +/** Reason a {@link ArchiveError} was raised, for mapping to a caller response. */ +export type ArchiveErrorReason = + | 'invalid' + | 'too_many_entries' + | 'entry_too_large' + | 'total_too_large' + +/** Raised for malformed archives and cap violations so callers can surface a clear message. */ +export class ArchiveError extends Error { + readonly reason: ArchiveErrorReason + readonly entryName?: string + + constructor(reason: ArchiveErrorReason, message: string, entryName?: string) { + super(message) + this.name = 'ArchiveError' + this.reason = reason + this.entryName = entryName + } +} + +/** + * Read a zip entry's declared uncompressed size without materializing it. This + * value comes straight from the (attacker-controlled) ZIP metadata, so it is only + * usable as a cheap fast-reject for honestly-declared archives — never as the + * authoritative cap. {@link inflateEntryWithinCaps} enforces the real limit on the + * inflated byte stream. + */ +export const readEntryUncompressedSize = (entry: JSZip.JSZipObject): number | undefined => { + const data = (entry as JSZip.JSZipObject & { _data?: { uncompressedSize?: number } })._data + const size = data?.uncompressedSize + return typeof size === 'number' && Number.isFinite(size) ? size : undefined +} + +type InflateResult = { ok: true; buffer: Buffer } | { ok: false; reason: 'entry' | 'total' } + +/** + * Inflate a single zip entry through a streaming counting sink, tearing the + * stream down the moment cumulative output would exceed the per-entry cap or the + * remaining total budget. The declared uncompressed size in the ZIP header is + * attacker-controlled and is NOT trusted here: a forged-small or absent size + * cannot cause the full (potentially gigabyte-scale) entry to be materialized in + * memory, because enforcement happens on the actual inflated bytes as they + * arrive. Peak memory is bounded by the cap plus one DEFLATE chunk. + */ +export const inflateEntryWithinCaps = ( + entry: JSZip.JSZipObject, + remainingTotalBudget: number +): Promise => + new Promise((resolve, reject) => { + const chunks: Buffer[] = [] + let size = 0 + let settled = false + const stream = entry.nodeStream() as Readable + + const settle = (result: InflateResult) => { + if (settled) return + settled = true + stream.destroy() + resolve(result) + } + + stream.on('data', (chunk: Buffer) => { + size += chunk.length + if (size > MAX_ARCHIVE_ENTRY_BYTES) { + settle({ ok: false, reason: 'entry' }) + return + } + if (size > remainingTotalBudget) { + settle({ ok: false, reason: 'total' }) + return + } + chunks.push(chunk) + }) + stream.on('end', () => settle({ ok: true, buffer: Buffer.concat(chunks, size) })) + stream.on('error', (error) => { + if (settled) return + settled = true + stream.destroy() + reject(error) + }) + }) + +/** True when a zip entry's unix mode marks it as a symlink (never extracted). */ +export const isSymlinkEntry = (entry: JSZip.JSZipObject): boolean => { + const mode = (entry as JSZip.JSZipObject & { unixPermissions?: number | null }).unixPermissions + return typeof mode === 'number' && (mode & S_IFMT) === S_IFLNK +} + +/** + * Normalize a zip entry path into safe path segments, guarding against zip-slip. + * Returns null for traversal (`..`) and empty paths; a leading slash or drive root + * is dropped to empty segments, so the entry stays relative (contained) rather + * than resolving outside its intended location. + */ +export const sanitizeArchiveEntryPath = (rawPath: string): string[] | null => { + const segments = rawPath + .replace(/\\/g, '/') + .split('/') + .map((segment) => segment.trim()) + .filter((segment) => segment.length > 0 && segment !== '.') + + if (segments.length === 0 || segments.includes('..')) return null + return segments +} + +/** Filesystem cruft that should never surface as a readable archive entry. */ +const isArchiveNoiseEntry = (segments: string[]): boolean => { + if (segments[0] === '__MACOSX') return true + const leaf = segments[segments.length - 1] + return leaf === '.DS_Store' || leaf === 'Thumbs.db' +} + +/** + * Parse an archive buffer, throwing {@link ArchiveError} with reason `invalid` + * when it is not a readable zip. + */ +async function loadArchive(buffer: Buffer): Promise { + try { + return await JSZip.loadAsync(buffer) + } catch { + throw new ArchiveError('invalid', 'Not a valid .zip archive') + } +} + +/** + * Enumerate the safe, extractable entry paths of an archive WITHOUT inflating + * them, each a sanitized `/`-joined path (e.g. `data/sheet.csv`). Skips + * directories, symlinks, zip-slip paths, and filesystem noise (`__MACOSX/`, + * `.DS_Store`, `Thumbs.db`). Throws {@link ArchiveError} `too_many_entries` past + * {@link MAX_ARCHIVE_ENTRIES}. + */ +export async function listArchiveEntries(buffer: Buffer): Promise { + const zip = await loadArchive(buffer) + + const realEntries = Object.values(zip.files).filter( + (entry) => !entry.dir && !isSymlinkEntry(entry) + ) + if (realEntries.length > MAX_ARCHIVE_ENTRIES) { + throw new ArchiveError( + 'too_many_entries', + `Archive has too many entries. Maximum is ${MAX_ARCHIVE_ENTRIES}.` + ) + } + + const paths: string[] = [] + for (const entry of realEntries) { + const segments = sanitizeArchiveEntryPath(entry.name) + if (!segments || isArchiveNoiseEntry(segments)) continue + paths.push(segments.join('/')) + } + return paths +} + +/** + * Extract a single archive entry by its sanitized `/`-joined path, inflating + * within the per-entry cap. Returns `null` when no entry matches. Throws + * {@link ArchiveError} `entry_too_large` if the inflated bytes exceed the cap. + */ +export async function extractArchiveEntry( + buffer: Buffer, + entryPath: string +): Promise { + const zip = await loadArchive(buffer) + + const match = Object.values(zip.files).find((entry) => { + if (entry.dir || isSymlinkEntry(entry)) return false + const segments = sanitizeArchiveEntryPath(entry.name) + return segments !== null && segments.join('/') === entryPath + }) + if (!match) return null + + const result = await inflateEntryWithinCaps(match, MAX_ARCHIVE_ENTRY_BYTES) + if (!result.ok) { + throw new ArchiveError( + 'entry_too_large', + `Archive entry "${entryPath}" is too large to extract. Maximum is ${ + MAX_ARCHIVE_ENTRY_BYTES / (1024 * 1024) + } MB per file.`, + entryPath + ) + } + return result.buffer +} diff --git a/apps/sim/lib/uploads/utils/file-utils.ts b/apps/sim/lib/uploads/utils/file-utils.ts index 0fd254f2e25..dc7f5b779bd 100644 --- a/apps/sim/lib/uploads/utils/file-utils.ts +++ b/apps/sim/lib/uploads/utils/file-utils.ts @@ -1,7 +1,11 @@ import type { Logger } from '@sim/logger' import { omit } from '@sim/utils/object' import type { StorageContext } from '@/lib/uploads' -import { ACCEPTED_FILE_TYPES, SUPPORTED_DOCUMENT_EXTENSIONS } from '@/lib/uploads/utils/validation' +import { + ACCEPTED_FILE_TYPES, + SUPPORTED_ARCHIVE_EXTENSIONS, + SUPPORTED_DOCUMENT_EXTENSIONS, +} from '@/lib/uploads/utils/validation' import { isUuid } from '@/executor/constants' import type { UserFile } from '@/executor/types' @@ -206,6 +210,16 @@ export function getFileExtension(filename: string): string { return lastDot !== -1 ? filename.slice(lastDot + 1).toLowerCase() : '' } +const ARCHIVE_EXTENSIONS = new Set(SUPPORTED_ARCHIVE_EXTENSIONS) + +/** + * True when a file name is a supported archive (zip). Detection is by extension + * so it is robust to the varied/empty MIME types browsers assign to archives. + */ +export function isArchiveFileName(filename: string): boolean { + return ARCHIVE_EXTENSIONS.has(getFileExtension(filename)) +} + const EXTENSION_TO_MIME: Record = { // Images jpg: 'image/jpeg', diff --git a/apps/sim/lib/uploads/utils/validation.test.ts b/apps/sim/lib/uploads/utils/validation.test.ts index 9d5d31ea1d6..c2b8daeb416 100644 --- a/apps/sim/lib/uploads/utils/validation.test.ts +++ b/apps/sim/lib/uploads/utils/validation.test.ts @@ -71,11 +71,20 @@ describe('validateAttachmentFileType', () => { expect(validateAttachmentFileType('config.json')).toBeNull() }) - it('rejects executables and unknown extensions', () => { + it('accepts zip archives', () => { + expect(validateAttachmentFileType('mydata.zip')).toBeNull() + expect(validateAttachmentFileType('mydata~1782582468496.zip')).toBeNull() + expect(validateAttachmentFileType('UPPER.ZIP')).toBeNull() + }) + + it('rejects executables and other archive formats we do not extract', () => { expect(validateAttachmentFileType('virus.exe')?.code).toBe('UNSUPPORTED_FILE_TYPE') expect(validateAttachmentFileType('installer.msi')?.code).toBe('UNSUPPORTED_FILE_TYPE') expect(validateAttachmentFileType('archive.dmg')?.code).toBe('UNSUPPORTED_FILE_TYPE') expect(validateAttachmentFileType('binary.bin')?.code).toBe('UNSUPPORTED_FILE_TYPE') + expect(validateAttachmentFileType('bundle.tar')?.code).toBe('UNSUPPORTED_FILE_TYPE') + expect(validateAttachmentFileType('bundle.gz')?.code).toBe('UNSUPPORTED_FILE_TYPE') + expect(validateAttachmentFileType('bundle.rar')?.code).toBe('UNSUPPORTED_FILE_TYPE') }) it('rejects files with no extension', () => { diff --git a/apps/sim/lib/uploads/utils/validation.ts b/apps/sim/lib/uploads/utils/validation.ts index b4e27684f63..757b76a7acd 100644 --- a/apps/sim/lib/uploads/utils/validation.ts +++ b/apps/sim/lib/uploads/utils/validation.ts @@ -95,6 +95,14 @@ export const SUPPORTED_AUDIO_EXTENSIONS = [ export const SUPPORTED_VIDEO_EXTENSIONS = ['mp4', 'mov', 'avi', 'mkv', 'webm'] as const +/** + * Archive formats accepted as chat attachments. A `.zip` is stored as a single + * object and presented to the agent as a virtual folder it can list and read + * entry-by-entry — extraction happens lazily on read in the copilot VFS, so the + * archive itself never needs a document parser here. + */ +export const SUPPORTED_ARCHIVE_EXTENSIONS = ['zip'] as const + export const SUPPORTED_IMAGE_EXTENSIONS = [ 'png', 'jpg', @@ -207,10 +215,18 @@ const SUPPORTED_IMAGE_MIME_TYPES = [ 'image/vnd.microsoft.icon', ] +const SUPPORTED_ARCHIVE_MIME_TYPES = [ + 'application/zip', + 'application/x-zip-compressed', + 'application/x-zip', +] + export const CHAT_ACCEPT_ATTRIBUTE = [ ACCEPT_ATTRIBUTE, ...SUPPORTED_IMAGE_MIME_TYPES, ...SUPPORTED_IMAGE_EXTENSIONS.map((ext) => `.${ext}`), + ...SUPPORTED_ARCHIVE_MIME_TYPES, + ...SUPPORTED_ARCHIVE_EXTENSIONS.map((ext) => `.${ext}`), ].join(',') export interface FileValidationError { @@ -226,14 +242,16 @@ export const SUPPORTED_ATTACHMENT_EXTENSIONS = Array.from( ...SUPPORTED_IMAGE_EXTENSIONS, ...SUPPORTED_AUDIO_EXTENSIONS, ...SUPPORTED_VIDEO_EXTENSIONS, + ...SUPPORTED_ARCHIVE_EXTENSIONS, ]) ) as readonly string[] /** * Validate that a file's extension is allowed as a chat/mothership attachment. * - * Permits documents, code, images, audio, and video — anything users would - * reasonably attach to a chat message. Rejects executables and unknown types. + * Permits documents, code, images, audio, video, and zip archives — anything + * users would reasonably attach to a chat message. Rejects executables and + * unknown types. */ export function validateAttachmentFileType(fileName: string): FileValidationError | null { const raw = extractExtension(fileName) @@ -242,7 +260,7 @@ export function validateAttachmentFileType(fileName: string): FileValidationErro if (!SUPPORTED_ATTACHMENT_EXTENSIONS.includes(extension)) { return { code: 'UNSUPPORTED_FILE_TYPE', - message: `Unsupported file type${extension ? `: ${extension}` : ` for "${fileName}"`}. Supported types include documents, code, images, audio, and video.`, + message: `Unsupported file type${extension ? `: ${extension}` : ` for "${fileName}"`}. Supported types include documents, code, images, audio, video, and zip archives.`, supportedTypes: [...SUPPORTED_ATTACHMENT_EXTENSIONS], } } From 1f729ef43eff5c91bfeffd29bc76b436a6644c69 Mon Sep 17 00:00:00 2001 From: waleed Date: Sat, 27 Jun 2026 18:59:50 -0700 Subject: [PATCH 2/7] fix(chat): cap archive read size, manifest-fallback on miss, dedupe entries Address review findings on the zip-upload feature: - guard archive list/read/grep on record.size > MAX_ARCHIVE_BYTES before downloading, so an oversized zip is never buffered into memory - a not-found archive entry now returns the file-tree manifest with a note (handles a stray /content habit suffix and typos) instead of failing - de-duplicate archive entries that sanitize to the same path (./a/b vs a/b) --- .../tools/handlers/upload-file-reader.test.ts | 23 ++++++- .../tools/handlers/upload-file-reader.ts | 68 ++++++++++++++++--- apps/sim/lib/uploads/archive.test.ts | 11 +++ apps/sim/lib/uploads/archive.ts | 11 ++- 4 files changed, 98 insertions(+), 15 deletions(-) diff --git a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts index 4c4b14d9d6f..1343e876df9 100644 --- a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts +++ b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts @@ -252,14 +252,31 @@ describe('readChatUploadPath / listChatUploadArchiveEntries (archive)', () => { expect(result?.content).toBe('latte') }) - it('returns null for an entry that is not in the archive', async () => { + it('falls back to the manifest (with a note) when the entry is not found', async () => { const buffer = await buildZip({ 'present.txt': 'x' }) mockOrderByThenLimit([makeRow({ displayName: 'bundle.zip', contentType: 'application/zip' })]) mockFetchWorkspaceFileBuffer.mockResolvedValueOnce(buffer) - const result = await readChatUploadPath('bundle.zip', 'missing.txt', CHAT_ID) + // Covers the /content habit suffix and plain typos uniformly. + const result = await readChatUploadPath('bundle.zip', 'content', CHAT_ID) - expect(result).toBeNull() + expect(result?.content).toContain('Entry "content" not found in "bundle.zip"') + expect(result?.content).toContain('present.txt') + }) + + it('rejects an oversized archive WITHOUT downloading it', async () => { + mockOrderByThenLimit([ + makeRow({ + displayName: 'huge.zip', + contentType: 'application/zip', + size: 200 * 1024 * 1024, // 200MB > 100MB cap + }), + ]) + + const result = await readChatUploadPath('huge.zip', 'anything.txt', CHAT_ID) + + expect(result?.content).toContain('[Archive too large to read: huge.zip') + expect(mockFetchWorkspaceFileBuffer).not.toHaveBeenCalled() }) it('returns the file-tree manifest for a bare archive read', async () => { diff --git a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts index 6d919be7e08..c86167f7e90 100644 --- a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts +++ b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts @@ -17,7 +17,12 @@ import { } from '@/lib/copilot/vfs/operations' import { decodeVfsSegment, encodeVfsSegment } from '@/lib/copilot/vfs/path-utils' import { getServePathPrefix } from '@/lib/uploads' -import { ArchiveError, extractArchiveEntry, listArchiveEntries } from '@/lib/uploads/archive' +import { + ArchiveError, + extractArchiveEntry, + listArchiveEntries, + MAX_ARCHIVE_BYTES, +} from '@/lib/uploads/archive' import { fetchWorkspaceFileBuffer, type WorkspaceFileRecord, @@ -160,6 +165,26 @@ export function isArchiveUpload(record: WorkspaceFileRecord): boolean { return isArchiveFileName(record.name) } +/** + * True when an archive's stored size exceeds the read cap, so it must not be + * downloaded + parsed inline. Checked against `record.size` BEFORE fetching so an + * oversized archive never gets buffered into memory (the decompress tool applies + * the same {@link MAX_ARCHIVE_BYTES} cap on its own download path). + */ +function exceedsArchiveReadCap(record: WorkspaceFileRecord): boolean { + return record.size > MAX_ARCHIVE_BYTES +} + +/** Placeholder for an archive too large to download and extract inline. */ +function archiveTooLargeResult(record: WorkspaceFileRecord): FileReadResult { + return { + content: `[Archive too large to read: ${record.name} (${Math.round( + record.size / 1024 / 1024 + )}MB, limit ${MAX_ARCHIVE_BYTES / 1024 / 1024}MB)]`, + totalLines: 1, + } +} + /** Decode each `/`-separated segment of a VFS entry path back to its real name. */ function decodeEntryPath(raw: string): string { return raw @@ -233,6 +258,10 @@ export async function listChatUploadArchiveEntries( if (!row) return null const record = toWorkspaceFileRecord(row) if (!isArchiveUpload(record)) return null + if (exceedsArchiveReadCap(record)) { + logger.warn('Archive too large to list entries', { zipName, chatId, size: record.size }) + return [] + } const encodedZip = canonicalUploadKey(record.name) try { @@ -282,13 +311,16 @@ async function readArchiveEntry( } /** - * Build a file-tree manifest for a bare archive read (`read("uploads/x.zip")`), - * so the agent gets the contents instead of binary bytes. Returns a placeholder - * result when the archive is unreadable. + * Build a file-tree manifest for an archive (`read("uploads/x.zip")`), so the + * agent gets the contents instead of binary bytes. An optional `note` is + * prepended — used to tell the agent a requested entry was not found while still + * showing the valid paths. Returns a placeholder result when the archive is + * unreadable. */ async function buildArchiveManifest( record: WorkspaceFileRecord, - archiveBuffer: Buffer + archiveBuffer: Buffer, + note?: string ): Promise { const encodedZip = canonicalUploadKey(record.name) try { @@ -296,7 +328,7 @@ async function buildArchiveManifest( const header = `Archive "${record.name}" — ${entries.length} file${ entries.length === 1 ? '' : 's' }. Read an entry with read("uploads/${encodedZip}/").` - const content = [header, '', ...entries].join('\n') + const content = [...(note ? [note, ''] : []), header, '', ...entries].join('\n') return { content, totalLines: content.split('\n').length } } catch (err) { if (err instanceof ArchiveError) { @@ -326,10 +358,23 @@ export async function readChatUploadPath( if (!isArchiveUpload(record)) { return await readFileRecord(record) } + if (exceedsArchiveReadCap(record)) { + return archiveTooLargeResult(record) + } const archiveBuffer = await fetchWorkspaceFileBuffer(record) - return entryPath - ? await readArchiveEntry(archiveBuffer, entryPath) - : await buildArchiveManifest(record, archiveBuffer) + if (!entryPath) { + return await buildArchiveManifest(record, archiveBuffer) + } + const entry = await readArchiveEntry(archiveBuffer, entryPath) + if (entry) return entry + // Entry not found — show the manifest so the agent can pick a valid path. + // Handles a stray `/content` habit suffix (carried over from files/) and + // plain typos uniformly, without special-casing any segment name. + return await buildArchiveManifest( + record, + archiveBuffer, + `Entry "${decodeEntryPath(entryPath)}" not found in "${record.name}".` + ) } catch (err) { logger.warn('Failed to read chat upload', { firstSegment, @@ -366,6 +411,11 @@ export async function grepChatUploadPath( const record = toWorkspaceFileRecord(row) if (entryPath && isArchiveUpload(record)) { + if (exceedsArchiveReadCap(record)) { + throw new WorkspaceFileGrepError( + `Archive too large to grep: "${record.name}" (limit ${MAX_ARCHIVE_BYTES / 1024 / 1024}MB).` + ) + } const archiveBuffer = await fetchWorkspaceFileBuffer(record) const rawPath = await findArchiveEntryRawPath(archiveBuffer, entryPath) if (!rawPath) { diff --git a/apps/sim/lib/uploads/archive.test.ts b/apps/sim/lib/uploads/archive.test.ts index a3acc55d97b..63e2d36aa52 100644 --- a/apps/sim/lib/uploads/archive.test.ts +++ b/apps/sim/lib/uploads/archive.test.ts @@ -61,6 +61,17 @@ describe('listArchiveEntries', () => { expect(paths).not.toContain('evil2.txt') }) + it('de-duplicates entries that sanitize to the same path', async () => { + const buffer = await buildZip({ + 'a/b.txt': 'first', + './a/b.txt': 'shadowed', + }) + + const paths = await listArchiveEntries(buffer) + + expect(paths).toEqual(['a/b.txt']) + }) + it('filters __MACOSX, .DS_Store and Thumbs.db noise', async () => { const buffer = await buildZip({ 'doc.txt': 'real', diff --git a/apps/sim/lib/uploads/archive.ts b/apps/sim/lib/uploads/archive.ts index 590e63c7c9a..807feced632 100644 --- a/apps/sim/lib/uploads/archive.ts +++ b/apps/sim/lib/uploads/archive.ts @@ -152,8 +152,9 @@ async function loadArchive(buffer: Buffer): Promise { * Enumerate the safe, extractable entry paths of an archive WITHOUT inflating * them, each a sanitized `/`-joined path (e.g. `data/sheet.csv`). Skips * directories, symlinks, zip-slip paths, and filesystem noise (`__MACOSX/`, - * `.DS_Store`, `Thumbs.db`). Throws {@link ArchiveError} `too_many_entries` past - * {@link MAX_ARCHIVE_ENTRIES}. + * `.DS_Store`, `Thumbs.db`), and de-duplicates entries that sanitize to the same + * path (e.g. `./a/b` and `a/b`) since only the first is extractable by path. + * Throws {@link ArchiveError} `too_many_entries` past {@link MAX_ARCHIVE_ENTRIES}. */ export async function listArchiveEntries(buffer: Buffer): Promise { const zip = await loadArchive(buffer) @@ -168,11 +169,15 @@ export async function listArchiveEntries(buffer: Buffer): Promise { ) } + const seen = new Set() const paths: string[] = [] for (const entry of realEntries) { const segments = sanitizeArchiveEntryPath(entry.name) if (!segments || isArchiveNoiseEntry(segments)) continue - paths.push(segments.join('/')) + const path = segments.join('/') + if (seen.has(path)) continue + seen.add(path) + paths.push(path) } return paths } From 5ea26995d89fbe61efe652c06cafc652ce817be5 Mon Sep 17 00:00:00 2001 From: waleed Date: Sat, 27 Jun 2026 19:03:43 -0700 Subject: [PATCH 3/7] fix(chat): resolve uploads whose name contains a literal percent A name like test%2A.zip is exposed double-encoded by glob/upload-context (test%252A.zip) but canonicalUploadKey decodes the input first, so a literal %2A is indistinguishable from an encoded * and the lookup misses. Add an encoded-form fallback (encode the stored name, compare to the raw input) which recovers the row without affecting the U+202F normalization path. --- .../tools/handlers/upload-file-reader.test.ts | 16 ++++++++++++ .../tools/handlers/upload-file-reader.ts | 25 ++++++++++++++++++- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts index 1343e876df9..e40fd8c3373 100644 --- a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts +++ b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts @@ -138,6 +138,22 @@ describe('findMothershipUploadRowByChatAndName', () => { expect(result?.id).toBe('wf_3') }) + + it('resolves a literal-% name via its encoded glob form', async () => { + // Stored name has a literal `%`; glob/upload-context expose it double-encoded + // (`test%252A.zip`). The encoded-form fallback recovers the row. + const row = makeRow({ + id: 'wf_pct', + displayName: 'test%2A.zip', + contentType: 'application/zip', + }) + mockOrderByThenLimit([]) + dbChainMockFns.orderBy.mockResolvedValueOnce([row] as never) + + const result = await findMothershipUploadRowByChatAndName(CHAT_ID, 'test%252A.zip') + + expect(result?.id).toBe('wf_pct') + }) }) describe('listChatUploads', () => { diff --git a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts index c86167f7e90..80849ecfe09 100644 --- a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts +++ b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts @@ -56,6 +56,21 @@ function canonicalUploadKey(name: string): string { } } +/** + * Per-segment encode of a stored name (no decode first), so a name containing a + * literal `%` (e.g. `test%2A.zip`) round-trips: glob/upload-context expose it as + * `encodeVfsSegment(name)`, and matching that encoded form back recovers the row. + * {@link canonicalUploadKey} can't, because it decodes the input first and a + * literal `%2A` is indistinguishable from an encoded `*`. + */ +function encodeUploadName(name: string): string { + try { + return encodeVfsSegment(name) + } catch { + return name.trim() + } +} + /** VFS-visible name. Coalesces to originalName for legacy rows that predate displayName. */ function vfsName(row: typeof workspaceFiles.$inferSelect): string { return row.displayName ?? row.originalName @@ -127,7 +142,15 @@ export async function findMothershipUploadRowByChatAndName( .orderBy(desc(workspaceFiles.uploadedAt), desc(workspaceFiles.id)) const segmentKey = canonicalUploadKey(fileName) - return allRows.find((r) => canonicalUploadKey(vfsName(r)) === segmentKey) ?? null + return ( + allRows.find((r) => { + const stored = vfsName(r) + // Canonical-key match handles visually-equivalent spellings (U+202F vs + // space); the encoded-form match handles literal `%` names that survive + // encode but not decode. + return canonicalUploadKey(stored) === segmentKey || encodeUploadName(stored) === fileName + }) ?? null + ) } /** From d12d9da9bfcf077018b91298c2e8a413d90869c8 Mon Sep 17 00:00:00 2001 From: waleed Date: Sun, 28 Jun 2026 13:17:02 -0700 Subject: [PATCH 4/7] fix(chat): honor glob depth for archive entries, consistent zip path encoding Address round-2 review: - filter expanded archive entries through the same micromatch matcher as the VFS map (new matchesVfsGlob), so uploads//data/* and /** are scoped correctly instead of returning every entry - build archive vfsPaths/manifest/grep labels with the per-segment encoder (encodeUploadName) so the zip segment matches the broad glob's spelling for literal-% names; canonicalUploadKey stays for resolution only - upload-context now hints glob("uploads//**") to list all entries --- apps/sim/lib/copilot/chat/payload.ts | 4 +-- .../tools/handlers/upload-file-reader.ts | 8 ++--- .../lib/copilot/tools/handlers/vfs.test.ts | 31 +++++++++++++++---- apps/sim/lib/copilot/tools/handlers/vfs.ts | 11 +++++-- apps/sim/lib/copilot/vfs/operations.ts | 10 ++++++ 5 files changed, 49 insertions(+), 15 deletions(-) diff --git a/apps/sim/lib/copilot/chat/payload.ts b/apps/sim/lib/copilot/chat/payload.ts index e2f8c0dcab0..744d462aea0 100644 --- a/apps/sim/lib/copilot/chat/payload.ts +++ b/apps/sim/lib/copilot/chat/payload.ts @@ -331,14 +331,14 @@ export async function buildCopilotRequestPayload( } file${entries.length === 1 ? '' : 's'}:`, ...treeLines, '', - `List entries with: glob("uploads/${encodedUploadName}/*")`, + `List entries with: glob("uploads/${encodedUploadName}/**")`, `Read an entry with: read("uploads/${encodedUploadName}/")`, `To save the archive permanently: materialize_file(fileName: "${displayName}")`, ] } else { lines = [ `Archive "${displayName}" (${mediaType}, ${f.size} bytes) uploaded.`, - `List entries with: glob("uploads/${encodedUploadName}/*")`, + `List entries with: glob("uploads/${encodedUploadName}/**")`, `Read an entry with: read("uploads/${encodedUploadName}/")`, `To save the archive permanently: materialize_file(fileName: "${displayName}")`, ] diff --git a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts index 80849ecfe09..1ba324c7310 100644 --- a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts +++ b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts @@ -286,7 +286,7 @@ export async function listChatUploadArchiveEntries( return [] } - const encodedZip = canonicalUploadKey(record.name) + const encodedZip = encodeUploadName(record.name) try { const buffer = await fetchWorkspaceFileBuffer(record) const entries = await listArchiveEntries(buffer) @@ -345,7 +345,7 @@ async function buildArchiveManifest( archiveBuffer: Buffer, note?: string ): Promise { - const encodedZip = canonicalUploadKey(record.name) + const encodedZip = encodeUploadName(record.name) try { const entries = await listArchiveEntries(archiveBuffer) const header = `Archive "${record.name}" — ${entries.length} file${ @@ -464,7 +464,7 @@ export async function grepChatUploadPath( type: getMimeTypeFromExtension(ext), ext, }) - const uploadsPath = `uploads/${canonicalUploadKey(record.name)}/${encodeEntryPath(rawPath)}` + const uploadsPath = `uploads/${encodeUploadName(record.name)}/${encodeEntryPath(rawPath)}` return grepReadResult(uploadsPath, result, pattern, uploadsPath, options) } @@ -472,6 +472,6 @@ export async function grepChatUploadPath( if (!result) { throw new WorkspaceFileGrepError(`Upload content not found for "${firstSegment}".`) } - const uploadsPath = `uploads/${canonicalUploadKey(record.name)}` + const uploadsPath = `uploads/${encodeUploadName(record.name)}` return grepReadResult(uploadsPath, result, pattern, uploadsPath, options) } diff --git a/apps/sim/lib/copilot/tools/handlers/vfs.test.ts b/apps/sim/lib/copilot/tools/handlers/vfs.test.ts index a028e2b0b2a..82416d350f0 100644 --- a/apps/sim/lib/copilot/tools/handlers/vfs.test.ts +++ b/apps/sim/lib/copilot/tools/handlers/vfs.test.ts @@ -424,16 +424,18 @@ describe('vfs archive uploads (virtual folders)', () => { vi.clearAllMocks() }) - it('expands a specific archive glob into its entry paths', async () => { + const ARCHIVE_ENTRIES = [ + { path: 'report.pdf', vfsPath: 'uploads/bundle.zip/report.pdf' }, + { path: 'data/sheet.csv', vfsPath: 'uploads/bundle.zip/data/sheet.csv' }, + ] + + it('expands a recursive archive glob (/**) into all entry paths', async () => { const vfs = makeVfs() getOrMaterializeVFS.mockResolvedValue(vfs) listChatUploads.mockResolvedValue([{ name: 'bundle.zip' }]) - listChatUploadArchiveEntries.mockResolvedValue([ - { path: 'report.pdf', vfsPath: 'uploads/bundle.zip/report.pdf' }, - { path: 'data/sheet.csv', vfsPath: 'uploads/bundle.zip/data/sheet.csv' }, - ]) + listChatUploadArchiveEntries.mockResolvedValue(ARCHIVE_ENTRIES) - const result = await executeVfsGlob({ pattern: 'uploads/bundle.zip/*' }, GREP_CTX_CHAT) + const result = await executeVfsGlob({ pattern: 'uploads/bundle.zip/**' }, GREP_CTX_CHAT) expect(listChatUploadArchiveEntries).toHaveBeenCalledWith('bundle.zip', 'chat-1') expect((result.output as { files: string[] }).files).toEqual( @@ -445,6 +447,23 @@ describe('vfs archive uploads (virtual folders)', () => { ) }) + it('honors glob depth — /* is top-level only, /data/* scopes to data', async () => { + const vfs = makeVfs() + getOrMaterializeVFS.mockResolvedValue(vfs) + listChatUploads.mockResolvedValue([{ name: 'bundle.zip' }]) + listChatUploadArchiveEntries.mockResolvedValue(ARCHIVE_ENTRIES) + + const topLevel = await executeVfsGlob({ pattern: 'uploads/bundle.zip/*' }, GREP_CTX_CHAT) + const topFiles = (topLevel.output as { files: string[] }).files + expect(topFiles).toContain('uploads/bundle.zip/report.pdf') + expect(topFiles).not.toContain('uploads/bundle.zip/data/sheet.csv') + + const scoped = await executeVfsGlob({ pattern: 'uploads/bundle.zip/data/*' }, GREP_CTX_CHAT) + const scopedFiles = (scoped.output as { files: string[] }).files + expect(scopedFiles).toContain('uploads/bundle.zip/data/sheet.csv') + expect(scopedFiles).not.toContain('uploads/bundle.zip/report.pdf') + }) + it('does not expand archives for the broad uploads/* glob', async () => { const vfs = makeVfs() getOrMaterializeVFS.mockResolvedValue(vfs) diff --git a/apps/sim/lib/copilot/tools/handlers/vfs.ts b/apps/sim/lib/copilot/tools/handlers/vfs.ts index 71d5d86c3c7..38807ee5ca3 100644 --- a/apps/sim/lib/copilot/tools/handlers/vfs.ts +++ b/apps/sim/lib/copilot/tools/handlers/vfs.ts @@ -4,7 +4,7 @@ import { TOOL_RESULT_MAX_INLINE_CHARS } from '@/lib/copilot/constants' import type { ExecutionContext, ToolCallResult } from '@/lib/copilot/request/types' import { getOrMaterializeVFS } from '@/lib/copilot/vfs' import type { GrepCountEntry, GrepMatch } from '@/lib/copilot/vfs/operations' -import { WorkspaceFileGrepError } from '@/lib/copilot/vfs/operations' +import { matchesVfsGlob, WorkspaceFileGrepError } from '@/lib/copilot/vfs/operations' import { encodeVfsSegment } from '@/lib/copilot/vfs/path-utils' import { grepChatUploadPath, @@ -216,12 +216,17 @@ export async function executeVfsGlob( files = [...files, ...uploadPaths] // Expand a specific archive's entries when the glob reaches inside it - // (uploads//*). Broad uploads/* keeps archives as single leaves. + // (uploads//*). Broad uploads/* keeps archives as single leaves. Entry + // paths are filtered through the same matcher as the VFS map, so the glob's + // depth (`/*` vs `/**` vs `/data/*`) is honored rather than dumping all. const archiveSegment = parseArchiveGlobSegment(pattern) if (archiveSegment) { const entries = await listChatUploadArchiveEntries(archiveSegment, context.chatId) if (entries) { - files = [...files, ...entries.map((entry) => entry.vfsPath)] + const matched = entries + .map((entry) => entry.vfsPath) + .filter((vfsPath) => matchesVfsGlob(vfsPath, pattern)) + files = [...files, ...matched] } } } diff --git a/apps/sim/lib/copilot/vfs/operations.ts b/apps/sim/lib/copilot/vfs/operations.ts index bd7208719b8..23dc4b4e168 100644 --- a/apps/sim/lib/copilot/vfs/operations.ts +++ b/apps/sim/lib/copilot/vfs/operations.ts @@ -94,6 +94,16 @@ const VFS_GLOB_OPTIONS: micromatch.Options = { noext: true, } +/** + * True when `filePath` matches the glob `pattern` under {@link VFS_GLOB_OPTIONS} + * (path-aware `*`/`?`, `**`, no brace/extglob). Exported so callers that build + * their own path list (e.g. virtual archive entries) filter it exactly the way + * {@link glob} filters the VFS map. + */ +export function matchesVfsGlob(filePath: string, pattern: string): boolean { + return micromatch.isMatch(filePath, pattern, VFS_GLOB_OPTIONS) +} + /** * Splits VFS text into lines for line-oriented grep. Strips a trailing CR so Windows-style * CRLF payloads still match patterns anchored at line end (`$`). From 8eb55034de67b2f45fbce640778168122d067964 Mon Sep 17 00:00:00 2001 From: waleed Date: Sun, 28 Jun 2026 13:27:16 -0700 Subject: [PATCH 5/7] fix(chat): surface archive errors on nested reads/greps instead of not-found A nested archive read/grep ran findArchiveEntryRawPath outside the ArchiveError catch, so an invalid/too-many-entries archive escaped to the generic handler and showed as "Upload not found" (read) or a generic grep failure, while a bare archive read already surfaced the real reason. Widen the catch so both paths report the actual ArchiveError message. --- .../tools/handlers/upload-file-reader.test.ts | 9 +++ .../tools/handlers/upload-file-reader.ts | 68 ++++++++++--------- 2 files changed, 45 insertions(+), 32 deletions(-) diff --git a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts index e40fd8c3373..f4b158503f9 100644 --- a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts +++ b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts @@ -280,6 +280,15 @@ describe('readChatUploadPath / listChatUploadArchiveEntries (archive)', () => { expect(result?.content).toContain('present.txt') }) + it('surfaces an archive error on a nested read instead of null', async () => { + mockOrderByThenLimit([makeRow({ displayName: 'bundle.zip', contentType: 'application/zip' })]) + mockFetchWorkspaceFileBuffer.mockResolvedValueOnce(Buffer.from('not a zip at all')) + + const result = await readChatUploadPath('bundle.zip', 'entry.txt', CHAT_ID) + + expect(result?.content).toContain('Not a valid .zip archive') + }) + it('rejects an oversized archive WITHOUT downloading it', async () => { mockOrderByThenLimit([ makeRow({ diff --git a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts index 1ba324c7310..e377ef2e8b6 100644 --- a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts +++ b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts @@ -306,31 +306,32 @@ export async function listChatUploadArchiveEntries( /** * Render one archive entry from the archive buffer with the same extraction - * logic as a stored upload. Returns null when the entry is missing; returns a - * placeholder result for cap violations. + * logic as a stored upload. Returns null when the entry is genuinely missing; + * returns a bracketed placeholder for any {@link ArchiveError} (invalid archive, + * too many entries, oversized entry) — matching {@link buildArchiveManifest} so a + * nested read surfaces the real reason instead of the VFS "Upload not found". */ async function readArchiveEntry( archiveBuffer: Buffer, entryPath: string ): Promise { - const rawPath = await findArchiveEntryRawPath(archiveBuffer, entryPath) - if (!rawPath) return null - let entryBuffer: Buffer | null try { - entryBuffer = await extractArchiveEntry(archiveBuffer, rawPath) + const rawPath = await findArchiveEntryRawPath(archiveBuffer, entryPath) + if (!rawPath) return null + const entryBuffer = await extractArchiveEntry(archiveBuffer, rawPath) + if (!entryBuffer) return null + const ext = getFileExtension(rawPath) + return renderFileBuffer(entryBuffer, { + name: rawPath, + type: getMimeTypeFromExtension(ext), + ext, + }) } catch (err) { if (err instanceof ArchiveError) { return { content: `[${err.message}]`, totalLines: 1 } } throw err } - if (!entryBuffer) return null - const ext = getFileExtension(rawPath) - return renderFileBuffer(entryBuffer, { - name: rawPath, - type: getMimeTypeFromExtension(ext), - ext, - }) } /** @@ -440,32 +441,35 @@ export async function grepChatUploadPath( ) } const archiveBuffer = await fetchWorkspaceFileBuffer(record) - const rawPath = await findArchiveEntryRawPath(archiveBuffer, entryPath) - if (!rawPath) { - throw new WorkspaceFileGrepError( - `Archive entry not found: "${decodeEntryPath(entryPath)}" in "${record.name}".` - ) - } - let entryBuffer: Buffer | null try { - entryBuffer = await extractArchiveEntry(archiveBuffer, rawPath) + const rawPath = await findArchiveEntryRawPath(archiveBuffer, entryPath) + if (!rawPath) { + throw new WorkspaceFileGrepError( + `Archive entry not found: "${decodeEntryPath(entryPath)}" in "${record.name}".` + ) + } + const entryBuffer = await extractArchiveEntry(archiveBuffer, rawPath) + if (!entryBuffer) { + throw new WorkspaceFileGrepError( + `Archive entry not found: "${rawPath}" in "${record.name}".` + ) + } + const ext = getFileExtension(rawPath) + const result = await renderFileBuffer(entryBuffer, { + name: rawPath, + type: getMimeTypeFromExtension(ext), + ext, + }) + const uploadsPath = `uploads/${encodeUploadName(record.name)}/${encodeEntryPath(rawPath)}` + return grepReadResult(uploadsPath, result, pattern, uploadsPath, options) } catch (err) { + // Surface archive failures (invalid/too-many/oversized) as a grep error + // with the real reason rather than a generic internal failure. if (err instanceof ArchiveError) { throw new WorkspaceFileGrepError(err.message) } throw err } - if (!entryBuffer) { - throw new WorkspaceFileGrepError(`Archive entry not found: "${rawPath}" in "${record.name}".`) - } - const ext = getFileExtension(rawPath) - const result = await renderFileBuffer(entryBuffer, { - name: rawPath, - type: getMimeTypeFromExtension(ext), - ext, - }) - const uploadsPath = `uploads/${encodeUploadName(record.name)}/${encodeEntryPath(rawPath)}` - return grepReadResult(uploadsPath, result, pattern, uploadsPath, options) } const result = await readFileRecord(record) From 74feedf886f6385132818f93aabbcd5cca7be845 Mon Sep 17 00:00:00 2001 From: waleed Date: Sun, 28 Jun 2026 13:37:55 -0700 Subject: [PATCH 6/7] fix(chat): de-dup archive entries by VFS canonical key, not raw path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Greptile: dedup keyed on the raw sanitized path, but archive paths are exposed through VFS per-segment encoding that NFC-normalizes, so visually-identical NFC/NFD entries (e.g. café.txt) kept both while emitting one shared vfsPath — shadowing the second on read. Move dedup to listChatUploadArchiveEntries / buildArchiveManifest, keyed on the same encodeEntryPath the resolver matches on; listArchiveEntries now returns raw paths (dedup is a VFS-presentation concern). --- .../tools/handlers/upload-file-reader.test.ts | 23 +++++++++++++++++++ .../tools/handlers/upload-file-reader.ts | 23 +++++++++++++++++-- apps/sim/lib/uploads/archive.test.ts | 11 --------- apps/sim/lib/uploads/archive.ts | 15 ++++++------ 4 files changed, 51 insertions(+), 21 deletions(-) diff --git a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts index f4b158503f9..c0e1988b881 100644 --- a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts +++ b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts @@ -245,6 +245,29 @@ describe('readChatUploadPath / listChatUploadArchiveEntries (archive)', () => { ]) }) + it('de-duplicates entries that collapse to one VFS key (NFC/NFD, ./ prefix)', async () => { + // "café.txt" stored twice (NFC precomposed + NFD decomposed) plus a + // ./-prefixed duplicate of a/b.txt — all collapse to the same VFS path, so + // only one of each must be listed (otherwise the second is unreachable). + const nfc = `caf\u00e9.txt` // precomposed e-acute + const nfd = `cafe\u0301.txt` // e + combining acute + expect(nfc).not.toBe(nfd) + const buffer = await buildZip({ + [nfc]: 'nfc', + [nfd]: 'nfd', + 'a/b.txt': 'first', + './a/b.txt': 'dup', + }) + mockOrderByThenLimit([makeRow({ displayName: 'bundle.zip', contentType: 'application/zip' })]) + mockFetchWorkspaceFileBuffer.mockResolvedValueOnce(buffer) + + const entries = await listChatUploadArchiveEntries('bundle.zip', CHAT_ID) + const vfsPaths = entries?.map((e) => e.vfsPath) ?? [] + + expect(vfsPaths.filter((p) => p === 'uploads/bundle.zip/caf%C3%A9.txt')).toHaveLength(1) + expect(vfsPaths.filter((p) => p === 'uploads/bundle.zip/a/b.txt')).toHaveLength(1) + }) + it('reads a nested entry by its exact path', async () => { const buffer = await buildZip({ 'data/sheet.csv': 'a,b\n1,2' }) mockOrderByThenLimit([makeRow({ displayName: 'bundle.zip', contentType: 'application/zip' })]) diff --git a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts index e377ef2e8b6..78065c3176a 100644 --- a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts +++ b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts @@ -242,6 +242,25 @@ function archiveEntryKey(path: string): string | null { } } +/** + * De-duplicate raw entry paths by their canonical VFS key (first wins), so two + * entries that differ only in a form the VFS normalizes away (NFC vs NFD, U+202F + * vs space, collapsed whitespace) collapse to one listed path. This matches how + * {@link findArchiveEntryRawPath} resolves a read — first entry whose key matches + * — so every listed path is reachable and none is silently shadowed. + */ +function dedupeArchiveEntriesByKey(paths: string[]): string[] { + const seen = new Set() + const result: string[] = [] + for (const path of paths) { + const key = archiveEntryKey(path) ?? path + if (seen.has(key)) continue + seen.add(key) + result.push(path) + } + return result +} + /** * Resolve a requested entry path (percent-encoded as the agent received it from * glob, or the raw display form from the manifest) to the archive's exact stored @@ -289,7 +308,7 @@ export async function listChatUploadArchiveEntries( const encodedZip = encodeUploadName(record.name) try { const buffer = await fetchWorkspaceFileBuffer(record) - const entries = await listArchiveEntries(buffer) + const entries = dedupeArchiveEntriesByKey(await listArchiveEntries(buffer)) return entries.map((path) => ({ path, vfsPath: `uploads/${encodedZip}/${encodeEntryPath(path)}`, @@ -348,7 +367,7 @@ async function buildArchiveManifest( ): Promise { const encodedZip = encodeUploadName(record.name) try { - const entries = await listArchiveEntries(archiveBuffer) + const entries = dedupeArchiveEntriesByKey(await listArchiveEntries(archiveBuffer)) const header = `Archive "${record.name}" — ${entries.length} file${ entries.length === 1 ? '' : 's' }. Read an entry with read("uploads/${encodedZip}/").` diff --git a/apps/sim/lib/uploads/archive.test.ts b/apps/sim/lib/uploads/archive.test.ts index 63e2d36aa52..a3acc55d97b 100644 --- a/apps/sim/lib/uploads/archive.test.ts +++ b/apps/sim/lib/uploads/archive.test.ts @@ -61,17 +61,6 @@ describe('listArchiveEntries', () => { expect(paths).not.toContain('evil2.txt') }) - it('de-duplicates entries that sanitize to the same path', async () => { - const buffer = await buildZip({ - 'a/b.txt': 'first', - './a/b.txt': 'shadowed', - }) - - const paths = await listArchiveEntries(buffer) - - expect(paths).toEqual(['a/b.txt']) - }) - it('filters __MACOSX, .DS_Store and Thumbs.db noise', async () => { const buffer = await buildZip({ 'doc.txt': 'real', diff --git a/apps/sim/lib/uploads/archive.ts b/apps/sim/lib/uploads/archive.ts index 807feced632..5fe4bf60d89 100644 --- a/apps/sim/lib/uploads/archive.ts +++ b/apps/sim/lib/uploads/archive.ts @@ -152,9 +152,12 @@ async function loadArchive(buffer: Buffer): Promise { * Enumerate the safe, extractable entry paths of an archive WITHOUT inflating * them, each a sanitized `/`-joined path (e.g. `data/sheet.csv`). Skips * directories, symlinks, zip-slip paths, and filesystem noise (`__MACOSX/`, - * `.DS_Store`, `Thumbs.db`), and de-duplicates entries that sanitize to the same - * path (e.g. `./a/b` and `a/b`) since only the first is extractable by path. - * Throws {@link ArchiveError} `too_many_entries` past {@link MAX_ARCHIVE_ENTRIES}. + * `.DS_Store`, `Thumbs.db`). Throws {@link ArchiveError} `too_many_entries` past + * {@link MAX_ARCHIVE_ENTRIES}. + * + * Paths are returned raw (not de-duplicated): two entries can collide only once + * projected into the VFS's canonical (NFC-encoded) form, so de-duplication + * belongs with the caller that owns that encoding (`listChatUploadArchiveEntries`). */ export async function listArchiveEntries(buffer: Buffer): Promise { const zip = await loadArchive(buffer) @@ -169,15 +172,11 @@ export async function listArchiveEntries(buffer: Buffer): Promise { ) } - const seen = new Set() const paths: string[] = [] for (const entry of realEntries) { const segments = sanitizeArchiveEntryPath(entry.name) if (!segments || isArchiveNoiseEntry(segments)) continue - const path = segments.join('/') - if (seen.has(path)) continue - seen.add(path) - paths.push(path) + paths.push(segments.join('/')) } return paths } From 99f272eef6f7037e4f57d1f23656786b8cd1a3c1 Mon Sep 17 00:00:00 2001 From: waleed Date: Sun, 28 Jun 2026 14:56:14 -0700 Subject: [PATCH 7/7] harden(chat): stream-cap archive downloads; refuse bare-archive grep Final-audit hardening for the zip-upload feature: - fetchWorkspaceFileBuffer gains an optional maxBytes that flows to downloadFile, enforced on the actual byte stream. The three archive fetches pass MAX_ARCHIVE_BYTES, so a stored object larger than its client-declared record.size can no longer be buffered fully into memory (record.size stays as a cheap early-out). Fixes a comment that overclaimed download-cap parity. - grepping a bare archive (no entry) now throws a guiding WorkspaceFileGrepError pointing at an entry/manifest, instead of grepping the binary placeholder. --- .../tools/handlers/upload-file-reader.test.ts | 11 ++++++++ .../tools/handlers/upload-file-reader.ts | 25 +++++++++++++------ .../workspace/workspace-file-manager.ts | 10 ++++++-- 3 files changed, 37 insertions(+), 9 deletions(-) diff --git a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts index c0e1988b881..f3a8ec0b498 100644 --- a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts +++ b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.test.ts @@ -31,6 +31,7 @@ vi.mock('@/lib/uploads/contexts/workspace/workspace-file-manager', () => ({ import { findMothershipUploadRowByChatAndName, + grepChatUploadPath, listChatUploadArchiveEntries, listChatUploads, readChatUploadPath, @@ -338,4 +339,14 @@ describe('readChatUploadPath / listChatUploadArchiveEntries (archive)', () => { expect(result?.content).toContain('report.pdf') expect(result?.content).toContain('data/sheet.csv') }) + + it('refuses to grep a bare archive, guiding the agent to an entry', async () => { + mockOrderByThenLimit([makeRow({ displayName: 'bundle.zip', contentType: 'application/zip' })]) + + await expect(grepChatUploadPath('bundle.zip', '', CHAT_ID, 'pattern')).rejects.toThrow( + /Cannot grep an archive directly/ + ) + // The archive bytes are never downloaded or grepped as a binary blob. + expect(mockFetchWorkspaceFileBuffer).not.toHaveBeenCalled() + }) }) diff --git a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts index 78065c3176a..e0fa59e0e39 100644 --- a/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts +++ b/apps/sim/lib/copilot/tools/handlers/upload-file-reader.ts @@ -189,10 +189,11 @@ export function isArchiveUpload(record: WorkspaceFileRecord): boolean { } /** - * True when an archive's stored size exceeds the read cap, so it must not be - * downloaded + parsed inline. Checked against `record.size` BEFORE fetching so an - * oversized archive never gets buffered into memory (the decompress tool applies - * the same {@link MAX_ARCHIVE_BYTES} cap on its own download path). + * True when an archive's recorded size exceeds the read cap. This is a cheap + * early-out on `record.size` to skip a doomed download; the download itself is + * also hard-capped on the actual byte stream (every archive fetch passes + * `{ maxBytes: MAX_ARCHIVE_BYTES }`), so an object larger than its recorded size + * still cannot be buffered fully into memory. */ function exceedsArchiveReadCap(record: WorkspaceFileRecord): boolean { return record.size > MAX_ARCHIVE_BYTES @@ -307,7 +308,7 @@ export async function listChatUploadArchiveEntries( const encodedZip = encodeUploadName(record.name) try { - const buffer = await fetchWorkspaceFileBuffer(record) + const buffer = await fetchWorkspaceFileBuffer(record, { maxBytes: MAX_ARCHIVE_BYTES }) const entries = dedupeArchiveEntriesByKey(await listArchiveEntries(buffer)) return entries.map((path) => ({ path, @@ -404,7 +405,7 @@ export async function readChatUploadPath( if (exceedsArchiveReadCap(record)) { return archiveTooLargeResult(record) } - const archiveBuffer = await fetchWorkspaceFileBuffer(record) + const archiveBuffer = await fetchWorkspaceFileBuffer(record, { maxBytes: MAX_ARCHIVE_BYTES }) if (!entryPath) { return await buildArchiveManifest(record, archiveBuffer) } @@ -459,7 +460,7 @@ export async function grepChatUploadPath( `Archive too large to grep: "${record.name}" (limit ${MAX_ARCHIVE_BYTES / 1024 / 1024}MB).` ) } - const archiveBuffer = await fetchWorkspaceFileBuffer(record) + const archiveBuffer = await fetchWorkspaceFileBuffer(record, { maxBytes: MAX_ARCHIVE_BYTES }) try { const rawPath = await findArchiveEntryRawPath(archiveBuffer, entryPath) if (!rawPath) { @@ -491,6 +492,16 @@ export async function grepChatUploadPath( } } + // A bare archive has no searchable text of its own — guide the agent to target + // an entry (or read the archive to list them) rather than grepping its bytes. + if (isArchiveUpload(record)) { + throw new WorkspaceFileGrepError( + `Cannot grep an archive directly. Grep an entry (e.g. grep path: "uploads/${encodeUploadName( + record.name + )}/") or read("uploads/${encodeUploadName(record.name)}") to list its contents.` + ) + } + const result = await readFileRecord(record) if (!result) { throw new WorkspaceFileGrepError(`Upload content not found for "${firstSegment}".`) diff --git a/apps/sim/lib/uploads/contexts/workspace/workspace-file-manager.ts b/apps/sim/lib/uploads/contexts/workspace/workspace-file-manager.ts index 8e091d8d8f2..ee37397c7d3 100644 --- a/apps/sim/lib/uploads/contexts/workspace/workspace-file-manager.ts +++ b/apps/sim/lib/uploads/contexts/workspace/workspace-file-manager.ts @@ -854,15 +854,21 @@ export async function getWorkspaceFile( } /** - * Download workspace file content + * Download workspace file content. Pass `maxBytes` to cap the download on the + * actual byte stream (not just the client-declared `record.size`), so a stored + * object larger than its recorded size cannot be buffered fully into memory. */ -export async function fetchWorkspaceFileBuffer(fileRecord: WorkspaceFileRecord): Promise { +export async function fetchWorkspaceFileBuffer( + fileRecord: WorkspaceFileRecord, + options: { maxBytes?: number } = {} +): Promise { logger.info(`Downloading workspace file: ${fileRecord.name}`) try { const buffer = await downloadFile({ key: fileRecord.key, context: fileRecord.storageContext ?? 'workspace', + maxBytes: options.maxBytes, }) logger.info( `Successfully downloaded workspace file: ${fileRecord.name} (${buffer.length} bytes)`