From 71796262c15a2ea93129c2d240f6571f64768354 Mon Sep 17 00:00:00 2001 From: Beinan Date: Wed, 3 Jun 2026 07:04:07 -0700 Subject: [PATCH 001/177] feat(java): allow schema override for fragment writes (#6919) ## Issue In distributed writes, workers can create uncommitted fragments and defer the final commit until all fragments are ready. The Java `WriteFragmentBuilder` only exposed `APPEND` mode without a way to pass the target dataset schema, so lance-core had to open the existing dataset to infer schema and field IDs before writing each fragment. That dataset open is unnecessary when the caller already has the target schema, and it becomes expensive for datasets with very large fragment counts because opening the dataset has to load/read manifest metadata. This shows up as fragment writing getting slower as the dataset grows, even before the final commit step. ## Summary - Add `WriteFragmentBuilder.schema(Schema)` so Java distributed writers can pass the target dataset schema when creating uncommitted fragments. - Pass the optional schema through Arrow FFI/JNI into `FragmentCreateBuilder.schema(...)`, avoiding the append-mode dataset open used only for schema inference. - Preserve current base path / object store write parameters when the schema override path is used. - Add Java coverage for append fragment writes with a schema override. ## Benefits - Lets Java callers avoid an expensive dataset open per fragment write in distributed append workflows. - Keeps Lance field IDs from the target dataset schema instead of inferring from the incoming Arrow batches. - Makes the Java API match the underlying Rust `FragmentCreateBuilder` capability. - Reduces write-time overhead for datasets with high fragment counts, especially when many workers are writing fragments concurrently. 
## Testing - `cargo check --manifest-path /tmp/lance-write-fragment-schema-pr/java/lance-jni/Cargo.toml` - `./mvnw -Dtest=org.lance.FragmentTest#testWriteFragmentWithSchemaOverride test` --------- Co-authored-by: Beinan Wang --- java/lance-jni/src/fragment.rs | 31 +++++-- java/src/main/java/org/lance/Fragment.java | 85 ++++++++++++++++++- .../java/org/lance/WriteFragmentBuilder.java | 36 +++++++- .../java/org/lance/schema/LanceField.java | 20 +++++ .../java/org/lance/schema/LanceSchema.java | 7 ++ .../src/test/java/org/lance/FragmentTest.java | 69 +++++++++++++++ 6 files changed, 235 insertions(+), 13 deletions(-) diff --git a/java/lance-jni/src/fragment.rs b/java/lance-jni/src/fragment.rs index a6798c2f237..d6603925947 100644 --- a/java/lance-jni/src/fragment.rs +++ b/java/lance-jni/src/fragment.rs @@ -4,7 +4,7 @@ use arrow::array::{RecordBatch, RecordBatchIterator, StructArray}; use arrow::ffi::{FFI_ArrowArray, FFI_ArrowSchema, from_ffi_and_data_type}; use arrow::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream}; -use arrow_schema::DataType; +use arrow_schema::{DataType, Schema as ArrowSchema}; use jni::objects::{JIntArray, JValue, JValueGen}; use jni::{ JNIEnv, @@ -19,7 +19,7 @@ use lance_io::utils::CachedFileSize; use lance_table::rowids::{RowIdSequence, write_row_ids}; use std::iter::once; -use lance::dataset::fragment::FileFragment; +use lance::dataset::fragment::write::FragmentCreateBuilder; use lance::io::ObjectStoreParams; use lance_datafusion::utils::StreamingWriteSource; use lance_io::object_store::{LanceNamespaceStorageOptionsProvider, StorageOptionsProvider}; @@ -108,6 +108,7 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiArray<'local>( table_id_obj: JObject, // List (can be null) allow_external_blob_outside_bases: JObject, // Optional blob_pack_file_size_threshold: JObject, // Optional + schema_addr: jlong, ) -> JObject<'local> { ok_or_throw_with_return!( env, @@ -130,6 +131,7 @@ pub extern "system" fn 
Java_org_lance_Fragment_createWithFfiArray<'local>( table_id_obj, allow_external_blob_outside_bases, blob_pack_file_size_threshold, + schema_addr, ), JObject::default() ) @@ -155,6 +157,7 @@ fn inner_create_with_ffi_array<'local>( table_id_obj: JObject, // List (can be null) allow_external_blob_outside_bases: JObject, // Optional blob_pack_file_size_threshold: JObject, // Optional + schema_addr: jlong, ) -> Result> { let c_array_ptr = arrow_array_addr as *mut FFI_ArrowArray; let c_schema_ptr = arrow_schema_addr as *mut FFI_ArrowSchema; @@ -186,6 +189,7 @@ fn inner_create_with_ffi_array<'local>( table_id_obj, allow_external_blob_outside_bases, blob_pack_file_size_threshold, + schema_addr, reader, ) } @@ -210,6 +214,7 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiStream<'a>( table_id_obj: JObject, // List (can be null) allow_external_blob_outside_bases: JObject, // Optional blob_pack_file_size_threshold: JObject, // Optional + schema_addr: jlong, ) -> JObject<'a> { ok_or_throw_with_return!( env, @@ -231,6 +236,7 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiStream<'a>( table_id_obj, allow_external_blob_outside_bases, blob_pack_file_size_threshold, + schema_addr, ), JObject::null() ) @@ -255,6 +261,7 @@ fn inner_create_with_ffi_stream<'local>( table_id_obj: JObject, // List (can be null) allow_external_blob_outside_bases: JObject, // Optional blob_pack_file_size_threshold: JObject, // Optional + schema_addr: jlong, ) -> Result> { let stream_ptr = arrow_array_stream_addr as *mut FFI_ArrowArrayStream; let reader = unsafe { ArrowArrayStreamReader::from_raw(stream_ptr) }?; @@ -276,6 +283,7 @@ fn inner_create_with_ffi_stream<'local>( table_id_obj, allow_external_blob_outside_bases, blob_pack_file_size_threshold, + schema_addr, reader, ) } @@ -298,6 +306,7 @@ fn create_fragment<'a>( table_id_obj: JObject, // List (can be null) allow_external_blob_outside_bases: JObject, // Optional blob_pack_file_size_threshold: JObject, // Optional + 
schema_addr: jlong, source: impl StreamingWriteSource, ) -> Result> { let path_str = dataset_uri.extract(env)?; @@ -345,11 +354,19 @@ fn create_fragment<'a>( }); } - let fragments = RT.block_on(FileFragment::create_fragments( - &path_str, - source, - Some(write_params), - ))?; + let mut builder = FragmentCreateBuilder::new(&path_str).write_params(&write_params); + let schema; + if schema_addr != 0 { + let c_schema_ptr = schema_addr as *mut FFI_ArrowSchema; + let c_schema = unsafe { FFI_ArrowSchema::from_raw(c_schema_ptr) }; + let arrow_schema = ArrowSchema::try_from(&c_schema)?; + // Schema::try_from restores Lance field IDs from the LANCE_FIELD_ID_KEY + // metadata inserted by LanceSchema.asArrowSchemaWithFieldIds(). + schema = Schema::try_from(&arrow_schema)?; + builder = builder.schema(&schema); + } + + let fragments = RT.block_on(builder.write_fragments(source))?; export_vec(env, &fragments) } diff --git a/java/src/main/java/org/lance/Fragment.java b/java/src/main/java/org/lance/Fragment.java index b27b189bf48..3b12e158617 100644 --- a/java/src/main/java/org/lance/Fragment.java +++ b/java/src/main/java/org/lance/Fragment.java @@ -18,6 +18,7 @@ import org.lance.ipc.LanceScanner; import org.lance.ipc.ScanOptions; import org.lance.namespace.LanceNamespace; +import org.lance.schema.LanceSchema; import org.apache.arrow.c.ArrowArray; import org.apache.arrow.c.ArrowArrayStream; @@ -260,6 +261,18 @@ static List create( WriteParams params, LanceNamespace namespaceClient, List tableId) { + return create(datasetUri, allocator, root, params, namespaceClient, tableId, null); + } + + /** Create a fragment from the given arrow array and schema. 
*/ + static List create( + String datasetUri, + BufferAllocator allocator, + VectorSchemaRoot root, + WriteParams params, + LanceNamespace namespaceClient, + List tableId, + LanceSchema schema) { Preconditions.checkNotNull(datasetUri); Preconditions.checkNotNull(allocator); Preconditions.checkNotNull(root); @@ -267,6 +280,30 @@ static List create( try (ArrowSchema arrowSchema = ArrowSchema.allocateNew(allocator); ArrowArray arrowArray = ArrowArray.allocateNew(allocator)) { Data.exportVectorSchemaRoot(allocator, root, null, arrowArray, arrowSchema); + if (schema != null) { + try (ArrowSchema lanceSchema = ArrowSchema.allocateNew(allocator)) { + Data.exportSchema(allocator, schema.asArrowSchemaWithFieldIds(), null, lanceSchema); + return createWithFfiArray( + datasetUri, + arrowArray.memoryAddress(), + arrowSchema.memoryAddress(), + params.getMaxRowsPerFile(), + params.getMaxRowsPerGroup(), + params.getMaxBytesPerFile(), + params.getMode(), + params.getEnableStableRowIds(), + params.getDataStorageVersion(), + params.getStorageOptions(), + params.getBaseStoreParams(), + params.getInitialBases(), + params.getTargetBases(), + namespaceClient, + tableId, + params.getAllowExternalBlobOutsideBases(), + params.getBlobPackFileSizeThreshold(), + lanceSchema.memoryAddress()); + } + } return createWithFfiArray( datasetUri, arrowArray.memoryAddress(), @@ -284,7 +321,8 @@ static List create( namespaceClient, tableId, params.getAllowExternalBlobOutsideBases(), - params.getBlobPackFileSizeThreshold()); + params.getBlobPackFileSizeThreshold(), + 0L); } } @@ -295,9 +333,45 @@ static List create( WriteParams params, LanceNamespace namespaceClient, List tableId) { + return create(datasetUri, null, stream, params, namespaceClient, tableId, null); + } + + /** Create a fragment from the given arrow stream. 
*/ + static List create( + String datasetUri, + BufferAllocator allocator, + ArrowArrayStream stream, + WriteParams params, + LanceNamespace namespaceClient, + List tableId, + LanceSchema schema) { Preconditions.checkNotNull(datasetUri); Preconditions.checkNotNull(stream); Preconditions.checkNotNull(params); + if (schema != null) { + Preconditions.checkNotNull(allocator, "allocator is required with schema"); + try (ArrowSchema lanceSchema = ArrowSchema.allocateNew(allocator)) { + Data.exportSchema(allocator, schema.asArrowSchemaWithFieldIds(), null, lanceSchema); + return createWithFfiStream( + datasetUri, + stream.memoryAddress(), + params.getMaxRowsPerFile(), + params.getMaxRowsPerGroup(), + params.getMaxBytesPerFile(), + params.getMode(), + params.getEnableStableRowIds(), + params.getDataStorageVersion(), + params.getStorageOptions(), + params.getBaseStoreParams(), + params.getInitialBases(), + params.getTargetBases(), + namespaceClient, + tableId, + params.getAllowExternalBlobOutsideBases(), + params.getBlobPackFileSizeThreshold(), + lanceSchema.memoryAddress()); + } + } return createWithFfiStream( datasetUri, stream.memoryAddress(), @@ -314,7 +388,8 @@ static List create( namespaceClient, tableId, params.getAllowExternalBlobOutsideBases(), - params.getBlobPackFileSizeThreshold()); + params.getBlobPackFileSizeThreshold(), + 0L); } /** Create a fragment from the given arrow array and schema. */ @@ -335,7 +410,8 @@ private static native List createWithFfiArray( LanceNamespace namespaceClient, List tableId, Optional allowExternalBlobOutsideBases, - Optional blobPackFileSizeThreshold); + Optional blobPackFileSizeThreshold, + long schemaMemoryAddress); /** Create a fragment from the given arrow stream. 
*/ private static native List createWithFfiStream( @@ -354,5 +430,6 @@ private static native List createWithFfiStream( LanceNamespace namespaceClient, List tableId, Optional allowExternalBlobOutsideBases, - Optional blobPackFileSizeThreshold); + Optional blobPackFileSizeThreshold, + long schemaMemoryAddress); } diff --git a/java/src/main/java/org/lance/WriteFragmentBuilder.java b/java/src/main/java/org/lance/WriteFragmentBuilder.java index 5d7dc1a42b2..2dbef873849 100644 --- a/java/src/main/java/org/lance/WriteFragmentBuilder.java +++ b/java/src/main/java/org/lance/WriteFragmentBuilder.java @@ -14,6 +14,7 @@ package org.lance; import org.lance.namespace.LanceNamespace; +import org.lance.schema.LanceSchema; import org.apache.arrow.c.ArrowArrayStream; import org.apache.arrow.memory.BufferAllocator; @@ -45,6 +46,7 @@ public class WriteFragmentBuilder { private BufferAllocator allocator; private VectorSchemaRoot vectorSchemaRoot; private ArrowArrayStream arrowArrayStream; + private LanceSchema schema; private WriteParams writeParams; private WriteParams.Builder writeParamsBuilder; private LanceNamespace namespaceClient; @@ -100,6 +102,22 @@ public WriteFragmentBuilder data(ArrowArrayStream stream) { return this; } + /** + * Set the Lance dataset schema to use when writing fragments. + * + *

This is useful for distributed writes where workers create uncommitted fragments and a + * coordinator commits them later. When this schema is supplied, lance-core does not need to open + * the existing dataset to infer the schema in APPEND mode. The schema should come from the target + * dataset so Lance field IDs are preserved. + * + * @param schema the target Lance dataset schema + * @return this builder + */ + public WriteFragmentBuilder schema(LanceSchema schema) { + this.schema = schema; + return this; + } + /** * Set the write parameters. * @@ -278,10 +296,22 @@ public List execute() { // storage options provider when these are non-null for credential refresh if (vectorSchemaRoot != null) { return Fragment.create( - datasetUri, allocator, vectorSchemaRoot, finalWriteParams, namespaceClient, tableId); + datasetUri, + allocator, + vectorSchemaRoot, + finalWriteParams, + namespaceClient, + tableId, + schema); } else { return Fragment.create( - datasetUri, arrowArrayStream, finalWriteParams, namespaceClient, tableId); + datasetUri, + allocator, + arrowArrayStream, + finalWriteParams, + namespaceClient, + tableId, + schema); } } @@ -312,6 +342,8 @@ private void validate() { Preconditions.checkState( vectorSchemaRoot == null || allocator != null, "allocator is required when using VectorSchemaRoot"); + Preconditions.checkState( + schema == null || allocator != null, "allocator is required with schema"); Preconditions.checkState( writeParams == null || writeParamsBuilder == null, "Cannot use both writeParams() and individual parameter methods"); diff --git a/java/src/main/java/org/lance/schema/LanceField.java b/java/src/main/java/org/lance/schema/LanceField.java index 9c7014092fa..4dbb3a0ea38 100644 --- a/java/src/main/java/org/lance/schema/LanceField.java +++ b/java/src/main/java/org/lance/schema/LanceField.java @@ -25,6 +25,7 @@ import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; import java.util.List; import java.util.Map; import 
java.util.Optional; @@ -156,6 +157,25 @@ public Field asArrowField() { name, new FieldType(nullable, type, dictionaryEncoding, metadata), arrowChildren); } + Field asArrowFieldWithFieldIds() { + List arrowChildren = + children.stream().map(LanceField::asArrowFieldWithFieldIds).collect(Collectors.toList()); + + if (type instanceof ArrowType.FixedSizeList) { + arrowChildren.addAll(childrenForFixedSizeList()); + } + + if (id < 0) { + throw new IllegalStateException("Lance field id is required for schema override: " + name); + } + Map metadataWithFieldId = new HashMap<>(metadata); + metadataWithFieldId.put(LanceSchema.LANCE_FIELD_ID_KEY, Integer.toString(id)); + return new Field( + name, + new FieldType(nullable, type, dictionaryEncoding, metadataWithFieldId), + arrowChildren); + } + private List childrenForFixedSizeList() { if (logicalType == null || logicalType.isEmpty()) { return Collections.emptyList(); diff --git a/java/src/main/java/org/lance/schema/LanceSchema.java b/java/src/main/java/org/lance/schema/LanceSchema.java index 9492ef45d5e..50a48e578af 100644 --- a/java/src/main/java/org/lance/schema/LanceSchema.java +++ b/java/src/main/java/org/lance/schema/LanceSchema.java @@ -23,6 +23,7 @@ import java.util.stream.Collectors; public class LanceSchema { + static final String LANCE_FIELD_ID_KEY = "lance:field_id"; private final List fields; private final Map metadata; @@ -68,6 +69,12 @@ public Schema asArrowSchema() { fields.stream().map(LanceField::asArrowField).collect(Collectors.toList()), metadata); } + public Schema asArrowSchemaWithFieldIds() { + return new Schema( + fields.stream().map(LanceField::asArrowFieldWithFieldIds).collect(Collectors.toList()), + metadata); + } + @Override public String toString() { return MoreObjects.toStringHelper(this) diff --git a/java/src/test/java/org/lance/FragmentTest.java b/java/src/test/java/org/lance/FragmentTest.java index 61bfc439290..29a21b5258a 100644 --- a/java/src/test/java/org/lance/FragmentTest.java +++ 
b/java/src/test/java/org/lance/FragmentTest.java @@ -17,9 +17,12 @@ import org.lance.ipc.LanceScanner; import org.lance.ipc.ScanOptions; import org.lance.operation.Merge; +import org.lance.operation.Project; import org.lance.operation.Update; +import org.lance.schema.LanceField; import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.UInt8Vector; import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.VectorSchemaRoot; @@ -29,6 +32,7 @@ import org.junit.jupiter.api.io.TempDir; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; @@ -37,6 +41,7 @@ import java.util.Optional; import java.util.stream.Collectors; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; @@ -81,6 +86,70 @@ void testFragmentCreate(@TempDir Path tempDir) throws Exception { } } + @Test + void testWriteFragmentWithSchemaOverride(@TempDir Path tempDir) throws Exception { + String datasetPath = tempDir.resolve("fragment_schema_override").toString(); + try (RootAllocator allocator = new RootAllocator(Long.MAX_VALUE)) { + TestUtils.SimpleTestDataset testDataset = + new TestUtils.SimpleTestDataset(allocator, datasetPath); + try (Dataset dataset = testDataset.createEmptyDataset()) { + List fieldList = + new ArrayList<>(testDataset.getSchema().getFields()); + Collections.reverse(fieldList); + + try (Transaction projectTxn = + new Transaction.Builder() + .readVersion(dataset.version()) + .operation(Project.builder().schema(new Schema(fieldList)).build()) + .build(); + Dataset evolvedDataset = new CommitBuilder(dataset).execute(projectTxn); + VectorSchemaRoot root = + VectorSchemaRoot.create(evolvedDataset.getSchema(), allocator)) { 
+ root.allocateNew(); + VarCharVector nameVector = (VarCharVector) root.getVector("name"); + IntVector idVector = (IntVector) root.getVector("id"); + nameVector.setSafe(0, "Person 1".getBytes(StandardCharsets.UTF_8)); + idVector.setSafe(0, 1); + root.setRowCount(1); + + List fragments = + Fragment.write() + .datasetUri(datasetPath) + .allocator(allocator) + .data(root) + .schema(evolvedDataset.getLanceSchema()) + .mode(WriteParams.WriteMode.APPEND) + .execute(); + + assertEquals(1, fragments.size()); + assertEquals(1, fragments.get(0).getPhysicalRows()); + assertArrayEquals( + evolvedDataset.getLanceSchema().fields().stream() + .mapToInt(LanceField::getId) + .toArray(), + fragments.get(0).getFiles().get(0).getFields()); + + FragmentOperation.Append appendOp = new FragmentOperation.Append(fragments); + try (Dataset appendedDataset = + Dataset.commit( + allocator, datasetPath, appendOp, Optional.of(evolvedDataset.version())); + ArrowReader reader = appendedDataset.newScan().scanBatches()) { + assertEquals(3, appendedDataset.version()); + assertEquals(1, appendedDataset.countRows()); + assertTrue(reader.loadNextBatch()); + VectorSchemaRoot batch = reader.getVectorSchemaRoot(); + assertEquals(1, batch.getRowCount()); + assertEquals( + "Person 1", + new String( + ((VarCharVector) batch.getVector("name")).get(0), StandardCharsets.UTF_8)); + assertEquals(1, ((IntVector) batch.getVector("id")).get(0)); + } + } + } + } + } + @Test void commitWithoutVersion(@TempDir Path tempDir) { String datasetPath = tempDir.resolve("commit_without_version").toString(); From f317dd18b3ebb2f3b3c0f297537a07a3e9b5d07b Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Thu, 4 Jun 2026 00:20:52 +0800 Subject: [PATCH 002/177] feat!: migrate bitmap to index segment based (#6869) Adds Bitmap support to the existing segment-based distributed index workflow. 
Callers can now build staged Bitmap roots with `create_index_uncommitted(..., index_type="BITMAP", fragment_ids=...)`, finalize them through `create_index_segment_builder().with_index_type("BITMAP").with_segments(...).build_all()`, and publish them with `commit_existing_index_segments(...)`. For Bitmap, `execute_uncommitted` now writes canonical `bitmap_page_lookup.lance` segment roots directly. The old public Python Bitmap shard workflow through `create_scalar_index(..., fragment_ids=...)` and `merge_index_metadata(..., "BITMAP")` is no longer exposed; callers should use the segment workflow instead. Relates to OSS-971 and OSS-972. --- python/python/lance/dataset.py | 36 +++--- python/python/lance/indices/__init__.py | 1 - python/python/tests/test_scalar_index.py | 136 ++++++-------------- rust/lance-index/src/scalar/bitmap.rs | 57 +++++++++ rust/lance/src/dataset.rs | 14 +-- rust/lance/src/index.rs | 14 ++- rust/lance/src/index/create.rs | 154 ++++++++--------------- rust/lance/src/index/scalar.rs | 48 ++++++- rust/lance/src/index/scalar/bitmap.rs | 76 +++++++++++ rust/lance/src/index/scalar_logical.rs | 107 ++++++++++++++++ 10 files changed, 416 insertions(+), 227 deletions(-) create mode 100644 rust/lance/src/index/scalar/bitmap.rs diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 4f7ed434436..743f5f580b6 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -3279,12 +3279,12 @@ def create_scalar_index( column, index_type, kwargs ) - if fragment_ids is not None and logical_index_type == "BTREE": + if fragment_ids is not None and logical_index_type in {"BTREE", "BITMAP"}: raise ValueError( - "BTree distributed indexing uses create_index_uncommitted(..., " - 'index_type="BTREE", fragment_ids=...)' + f"{logical_index_type} distributed indexing uses " + "create_index_uncommitted(..., " + f'index_type="{logical_index_type}", fragment_ids=...)' ) - # Add fragment_ids and index_uuid to kwargs if provided 
if fragment_ids is not None: kwargs["fragment_ids"] = fragment_ids @@ -3975,10 +3975,11 @@ def create_index_uncommitted( """ Create one segment without publishing it and return its metadata. - This is the public distributed-build API for vector and BTREE scalar - index construction. Unlike :meth:`create_index`, this method does not - publish the index into the dataset manifest. Instead, it writes one - segment under ``_indices//`` and returns the resulting + This is the public distributed-build API for vector, BTREE scalar, + and canonical bitmap scalar index construction. Unlike + :meth:`create_index`, this method does not publish the index into the + dataset manifest. Instead, it writes one segment under + ``_indices//`` and returns the resulting :class:`Index` metadata. Callers should: @@ -3991,8 +3992,8 @@ def create_index_uncommitted( 4. commit the final segment list with :meth:`commit_existing_index_segments` - BTREE segments do not yet support the segment builder (steps 3-4); collect - the returned segments and pass them straight to + BTREE segments do not yet support merging; collect the returned + segments and pass them straight to :meth:`commit_existing_index_segments`. Parameters are the same as :meth:`create_index`, with one additional @@ -4005,13 +4006,13 @@ def create_index_uncommitted( Index Metadata for the segment that was written by this call. """ - is_btree_request = ( - isinstance(index_type, str) and index_type.upper() == "BTREE" + is_scalar_segment_request = ( + isinstance(index_type, str) and index_type.upper() in {"BTREE", "BITMAP"} ) or ( isinstance(index_type, IndexConfig) - and index_type.index_type.upper() == "BTREE" + and index_type.index_type.upper() in {"BTREE", "BITMAP"} ) - if is_btree_request: + if is_scalar_segment_request: if fragment_ids is None: raise ValueError( "create_index_uncommitted requires fragment_ids " @@ -4106,9 +4107,10 @@ def merge_index_metadata( """ Merge distributed scalar index metadata. 
- Vector distributed indexing no longer uses this API. For vector indices, - build segments with :meth:`create_index_uncommitted`, optionally merge - them with :meth:`merge_existing_index_segments`, and publish them with + Vector and Bitmap distributed indexing no longer use this API. For + those index families, build segments with + :meth:`create_index_uncommitted`, optionally merge caller-defined + groups with :meth:`merge_existing_index_segments`, and publish them with :meth:`commit_existing_index_segments`. This method does NOT commit changes. diff --git a/python/python/lance/indices/__init__.py b/python/python/lance/indices/__init__.py index 40dc9ed93ac..675754cc2d0 100644 --- a/python/python/lance/indices/__init__.py +++ b/python/python/lance/indices/__init__.py @@ -30,7 +30,6 @@ class IndexFileVersion(str, Enum): class SupportedDistributedIndices(str, Enum): # Scalar index types BTREE = "BTREE" - BITMAP = "BITMAP" INVERTED = "INVERTED" # Precise vector index types supported by distributed merge diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index d173ef807ba..59654d848df 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -3783,41 +3783,52 @@ def test_distribute_btree_index_build(tmp_path): ) -def _assert_committed_distributed_bitmap_index(ds, index_id, index_name, fragment_ids): - ds.merge_index_metadata(index_id, index_type="BITMAP") +def test_bitmap_uncommitted_segments_can_be_committed_from_python(tmp_path): + dataset_path = tmp_path / "bitmap_segments.lance" + ds = generate_multi_fragment_bitmap_dataset( + dataset_path, num_fragments=4, rows_per_fragment=40 + ) - from lance.dataset import Index + index_name = "bitmap_segment_idx" + fragment_ids = [fragment.fragment_id for fragment in ds.get_fragments()] + fragment_groups = [ + fragment_ids[idx : idx + 2] for idx in range(0, len(fragment_ids), 2) + ] + assert len(fragment_groups) >= 2 - field_id = 
ds.schema.get_field_index("category") - index = Index( - uuid=index_id, - name=index_name, - fields=[field_id], - dataset_version=ds.version, - fragment_ids=set(fragment_ids), - index_version=0, - ) - create_index_op = lance.LanceOperation.CreateIndex( - new_indices=[index], - removed_indices=[], - ) - lance.LanceDataset.commit( - ds.uri, - create_index_op, - read_version=ds.version, - ) - reopened_ds = lance.dataset(ds.uri) + staged_segments = [ + ds.create_index_uncommitted( + column="category", + index_type="BITMAP", + name=index_name, + fragment_ids=fragment_group, + ) + for fragment_group in fragment_groups + ] - stats = reopened_ds.stats.index_stats(index_name) - assert stats["index_type"] == "Bitmap" + assert len({segment.uuid for segment in staged_segments}) == len(staged_segments) + for segment, fragment_group in zip(staged_segments, fragment_groups): + assert segment.fragment_ids == set(fragment_group) + assert any(file.path == "bitmap_page_lookup.lance" for file in segment.files) + assert all(not file.path.startswith("part_") for file in segment.files) + + merged_segment = ds.merge_existing_index_segments(staged_segments) + assert merged_segment.uuid not in {segment.uuid for segment in staged_segments} + assert merged_segment.fragment_ids == set(fragment_ids) + assert any(file.path == "bitmap_page_lookup.lance" for file in merged_segment.files) + assert all(not file.path.startswith("part_") for file in merged_segment.files) + + ds = ds.commit_existing_index_segments(index_name, "category", [merged_segment]) + descriptions = {index.name: index for index in ds.describe_indices()} + assert len(descriptions[index_name].segments) == 1 filter_expr = "category = 3" - without_index = reopened_ds.scanner( + without_index = ds.scanner( filter=filter_expr, columns=["id", "category"], use_scalar_index=False, ).to_table() - with_index = reopened_ds.scanner( + with_index = ds.scanner( filter=filter_expr, columns=["id", "category"], use_scalar_index=True, @@ -3826,78 
+3837,11 @@ def _assert_committed_distributed_bitmap_index(ds, index_id, index_name, fragmen assert with_index.num_rows == without_index.num_rows assert with_index["id"].to_pylist() == without_index["id"].to_pylist() assert set(with_index["category"].to_pylist()) == {3} - - explain = reopened_ds.scanner( - filter=filter_expr, - use_scalar_index=True, - ).explain_plan() - assert "ScalarIndexQuery" in explain - - empty_without_index = reopened_ds.scanner( - filter="category = 99", - use_scalar_index=False, - ).to_table() - empty_with_index = reopened_ds.scanner( - filter="category = 99", - use_scalar_index=True, - ).to_table() - assert empty_with_index.num_rows == empty_without_index.num_rows == 0 - - -def test_distributed_bitmap_index_build(tmp_path): - ds = generate_multi_fragment_bitmap_dataset( - tmp_path / "bitmap_dist.lance", num_fragments=4, rows_per_fragment=40 - ) - - index_id = str(uuid.uuid4()) - index_name = "bitmap_multiple_fragment_idx" - fragments = ds.get_fragments() - fragment_ids = [fragment.fragment_id for fragment in fragments] - fragment_groups = [ - fragment_ids[idx : idx + 2] for idx in range(0, len(fragment_ids), 2) - ] - assert len(fragment_groups) >= 2 - - for shard_id, fragment_group in enumerate(fragment_groups): - ds.create_scalar_index( - column="category", - index_type=IndexConfig( - index_type="bitmap", - parameters={"shard_id": shard_id}, - ), - name=index_name, - replace=False, - index_uuid=index_id, - fragment_ids=fragment_group, - ) - - _assert_committed_distributed_bitmap_index(ds, index_id, index_name, fragment_ids) - - -def test_distributed_bitmap_index_build_single_fragment_shards(tmp_path): - ds = generate_multi_fragment_bitmap_dataset( - tmp_path / "bitmap_single_fragment_dist.lance", - num_fragments=4, - rows_per_fragment=40, + assert ( + "ScalarIndexQuery" + in ds.scanner(filter=filter_expr, use_scalar_index=True).explain_plan() ) - index_id = str(uuid.uuid4()) - index_name = "bitmap_single_fragment_idx" - fragment_ids = 
[fragment.fragment_id for fragment in ds.get_fragments()] - assert len(fragment_ids) >= 2 - - for fragment_id in fragment_ids: - ds.create_scalar_index( - column="category", - index_type="BITMAP", - name=index_name, - replace=False, - index_uuid=index_id, - fragment_ids=[fragment_id], - ) - - _assert_committed_distributed_bitmap_index(ds, index_id, index_name, fragment_ids) - def test_merge_index_metadata_btree_soft_break(tmp_path): ds = generate_multi_fragment_dataset( diff --git a/rust/lance-index/src/scalar/bitmap.rs b/rust/lance-index/src/scalar/bitmap.rs index eb0276dcb9f..228a522c634 100644 --- a/rust/lance-index/src/scalar/bitmap.rs +++ b/rust/lance-index/src/scalar/bitmap.rs @@ -1591,6 +1591,63 @@ pub async fn merge_index_files( Ok(()) } +pub async fn merge_bitmap_indices( + source_indices: &[Arc], + dest_store: &dyn IndexStore, + progress: Arc, +) -> Result { + if source_indices.is_empty() { + return Err(Error::invalid_input( + "Bitmap segment merge requires at least one source segment".to_string(), + )); + } + + let value_type = source_indices[0].value_type().clone(); + let mut merged_state = HashMap::::new(); + + progress + .stage_start( + "merge_bitmap_segments", + Some(source_indices.len() as u64), + "segments", + ) + .await?; + for (idx, source_index) in source_indices.iter().enumerate() { + if source_index.value_type() != &value_type { + return Err(Error::invalid_input(format!( + "Bitmap segment has value type {:?}, expected {:?}", + source_index.value_type(), + value_type + ))); + } + + let state = source_index.load_bitmap_index_state().await?; + for (key, bitmap) in state { + merged_state + .entry(key) + .and_modify(|existing| *existing |= &bitmap) + .or_insert(bitmap); + } + progress + .stage_progress("merge_bitmap_segments", (idx + 1) as u64) + .await?; + } + progress.stage_complete("merge_bitmap_segments").await?; + + progress + .stage_start("write_bitmap_index", Some(1), "files") + .await?; + BitmapIndexPlugin::write_bitmap_index(merged_state, 
dest_store, &value_type).await?; + progress.stage_progress("write_bitmap_index", 1).await?; + progress.stage_complete("write_bitmap_index").await?; + + Ok(CreatedIndex { + index_details: prost_types::Any::from_msg(&pbold::BitmapIndexDetails::default()).unwrap(), + index_version: BITMAP_INDEX_VERSION, + files: Some(dest_store.list_files_with_sizes().await?), + }) +} + #[async_trait] impl ScalarIndexPlugin for BitmapIndexPlugin { fn name(&self) -> &str { diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index 52e21bd7cba..fdb18398e6e 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -3059,13 +3059,13 @@ impl Dataset { )) } IndexType::Bitmap => { - lance_index::scalar::bitmap::merge_index_files( - self.object_store.as_ref(), - &index_dir, - Arc::new(store), - progress, - ) - .await + Err(Error::invalid_input( + "Bitmap distributed indexing no longer supports merge_index_metadata; \ + build segments with create_index_uncommitted(...), merge them with \ + merge_existing_index_segments(...), and commit with \ + commit_existing_index_segments(...)" + .to_string(), + )) } IndexType::IvfFlat | IndexType::IvfPq | IndexType::IvfSq | IndexType::Vector => { Err(Error::invalid_input( diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index eaa3dc6119d..9b3250f2ca8 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -250,6 +250,13 @@ fn segment_has_inverted_details(segment: &IndexMetadata) -> bool { .is_some_and(|details| details.type_url.ends_with("InvertedIndexDetails")) } +fn segment_has_bitmap_details(segment: &IndexMetadata) -> bool { + segment + .index_details + .as_ref() + .is_some_and(|details| details.type_url.ends_with("BitmapIndexDetails")) +} + // Cache keys for different index types #[derive(Debug, Clone)] pub(crate) struct LegacyVectorIndexCacheKey<'a> { @@ -1069,7 +1076,8 @@ impl DatasetIndexExt for Dataset { } let all_vector = source_segments.iter().all(segment_has_vector_details); let 
all_inverted = source_segments.iter().all(segment_has_inverted_details); - if !all_vector && !all_inverted { + let all_bitmap = source_segments.iter().all(segment_has_bitmap_details); + if !all_vector && !all_inverted && !all_bitmap { return Err(Error::invalid_input( "merge_existing_index_segments requires all segments to have the same supported index type" .to_string(), @@ -1083,8 +1091,10 @@ impl DatasetIndexExt for Dataset { source_segments, ) .await? - } else { + } else if all_inverted { crate::index::scalar::inverted::merge_segments(self, source_segments).await? + } else { + crate::index::scalar::bitmap::merge_segments(self, source_segments).await? }; merged_segment.dataset_version = self.manifest.version; merged_segment.fields = vec![field_id]; diff --git a/rust/lance/src/index/create.rs b/rust/lance/src/index/create.rs index af7ea7ce19c..a20efe63929 100644 --- a/rust/lance/src/index/create.rs +++ b/rust/lance/src/index/create.rs @@ -10,7 +10,7 @@ use crate::{ index::{ DatasetIndexExt, DatasetIndexInternalExt, IntoIndexSegment, build_index_metadata_from_segments, - scalar::build_scalar_index, + scalar::{build_bitmap_index_segment, build_scalar_index}, vector::{ LANCE_VECTOR_INDEX, VectorIndexParams, build_distributed_vector_index, build_empty_vector_index, build_vector_index, @@ -258,17 +258,44 @@ impl<'a> CreateIndexBuilder<'a> { .preprocessed_data .take() .map(|reader| lance_datafusion::utils::reader_to_stream(Box::new(reader))); - build_scalar_index( - self.dataset, - column, - &index_id.to_string(), - ¶ms, - train, - self.fragments.clone(), - preprocesssed_data, - self.progress.clone(), - ) - .await? 
+ if self.index_type == IndexType::Bitmap && self.fragments.is_some() { + if !train { + return Err(Error::invalid_input( + "canonical bitmap segment build requires train=true".to_string(), + )); + } + if preprocesssed_data.is_some() { + return Err(Error::invalid_input( + "canonical bitmap segment build does not accept preprocessed data" + .to_string(), + )); + } + let fragments = self.fragments.clone().ok_or_else(|| { + Error::invalid_input( + "canonical bitmap segment build requires fragment ids".to_string(), + ) + })?; + build_bitmap_index_segment( + self.dataset, + column, + &index_id.to_string(), + fragments, + self.progress.clone(), + ) + .await? + } else { + build_scalar_index( + self.dataset, + column, + &index_id.to_string(), + ¶ms, + train, + self.fragments.clone(), + preprocesssed_data, + self.progress.clone(), + ) + .await? + } } (IndexType::Scalar, LANCE_SCALAR_INDEX) => { // Guess the index type @@ -615,7 +642,6 @@ mod tests { use lance_index::vector::ivf::IvfBuildParams; use lance_index::vector::kmeans::{KMeansParams, train_kmeans}; use lance_linalg::distance::{DistanceType, MetricType}; - use serde_json::json; use std::{collections::BTreeSet, ops::Bound, sync::Arc}; use uuid::Uuid; @@ -1389,10 +1415,8 @@ mod tests { } #[tokio::test] - async fn test_distributed_build_bitmap() { - use datafusion::common::ScalarValue; - use lance_index::scalar::{SargableQuery, SearchResult, bitmap::BITMAP_LOOKUP_NAME}; - use lance_select::RowSetOps; + async fn test_bitmap_execute_uncommitted_writes_canonical_segment() { + use lance_index::scalar::bitmap::BITMAP_LOOKUP_NAME; let tmpdir = TempStrDir::default(); let dataset_uri = format!("file://{}", tmpdir.as_str()); @@ -1432,69 +1456,15 @@ mod tests { ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::Bitmap); let fragments = dataset.get_fragments(); let fragment_ids: Vec = fragments.iter().map(|f| f.id() as u32).collect(); - let shared_uuid = Uuid::new_v4().to_string(); - let mut shard_metadata = 
None; - let shard_groups = fragment_ids.chunks(2).collect::>(); - - for (shard_id, fragment_group) in shard_groups.iter().enumerate() { - let params = base_params - .clone() - .with_params(&json!({ "shard_id": shard_id as u32 })); - let index_metadata = - CreateIndexBuilder::new(&mut dataset, &["category"], IndexType::Bitmap, ¶ms) - .name("distributed_bitmap".to_string()) - .fragments(fragment_group.to_vec()) - .index_uuid(shared_uuid.clone()) - .execute_uncommitted() - .await - .unwrap(); - if shard_metadata.is_none() { - shard_metadata = Some(index_metadata); - } - } - - dataset - .merge_index_metadata( - &shared_uuid, - IndexType::Bitmap, - None, - Arc::new(NoopIndexBuildProgress), - ) - .await - .unwrap(); - - let mut committed_index_metadata = shard_metadata.unwrap(); - committed_index_metadata.fragment_bitmap = Some(fragment_ids.iter().copied().collect()); - committed_index_metadata.files = Some( - list_index_files_with_sizes( - dataset.object_store.as_ref(), - &dataset.indices_dir().clone().join(shared_uuid.clone()), - ) - .await - .unwrap(), - ); - committed_index_metadata.dataset_version = dataset.manifest.version; - - let transaction = TransactionBuilder::new( - dataset.manifest.version, - Operation::CreateIndex { - new_indices: vec![committed_index_metadata], - removed_indices: vec![], - }, - ) - .build(); - dataset - .apply_commit(transaction, &Default::default(), &Default::default()) - .await - .unwrap(); + let selected_fragments = fragment_ids[..2].to_vec(); + let index = + CreateIndexBuilder::new(&mut dataset, &["category"], IndexType::Bitmap, &base_params) + .name("bitmap_segment".to_string()) + .fragments(selected_fragments.clone()) + .execute_uncommitted() + .await + .unwrap(); - let dataset = Dataset::open(&dataset_uri).await.unwrap(); - let indices = dataset - .load_indices_by_name("distributed_bitmap") - .await - .unwrap(); - assert_eq!(indices.len(), 1); - let index = &indices[0]; assert_eq!( index .fragment_bitmap @@ -1502,37 +1472,15 @@ mod 
tests { .unwrap() .iter() .collect::>(), - fragment_ids + selected_fragments ); let files = index.files.as_ref().unwrap(); assert!(files.iter().any(|file| file.path == BITMAP_LOOKUP_NAME)); assert!( files.iter().all(|file| !file.path.starts_with("part_")), - "committed bitmap index should only reference merged files" + "staged bitmap segment should only reference canonical files" ); - - let scalar_index = crate::index::scalar::open_scalar_index( - &dataset, - "category", - index, - &NoOpMetricsCollector, - ) - .await - .unwrap(); - assert_eq!(scalar_index.index_type(), IndexType::Bitmap); - - let query_result = scalar_index - .search( - &SargableQuery::Equals(ScalarValue::Int32(Some(2))), - &NoOpMetricsCollector, - ) - .await - .unwrap(); - let SearchResult::Exact(query_rows) = query_result else { - panic!("expected exact bitmap result"); - }; - assert_eq!(query_rows.true_rows().len(), Some(2)); } #[tokio::test] diff --git a/rust/lance/src/index/scalar.rs b/rust/lance/src/index/scalar.rs index 18c218ef4f7..05ddb273a93 100644 --- a/rust/lance/src/index/scalar.rs +++ b/rust/lance/src/index/scalar.rs @@ -4,6 +4,7 @@ //! Utilities for integrating scalar indices with datasets //! +pub(crate) mod bitmap; pub(crate) mod inverted; pub use inverted::{load_segment_details, load_segments}; @@ -40,7 +41,7 @@ use lance_index::scalar::label_list::{ use lance_index::scalar::registry::{ ScalarIndexPlugin, TrainingCriteria, TrainingOrdering, VALUE_COLUMN_NAME, }; -use lance_index::scalar::{CreatedIndex, InvertedIndexParams}; +use lance_index::scalar::{BuiltinIndexType, CreatedIndex, InvertedIndexParams}; use lance_index::scalar::{ ScalarIndex, ScalarIndexParams, bitmap::BITMAP_LOOKUP_NAME, inverted::INVERT_LIST_FILE, lance_format::LanceIndexStore, @@ -321,6 +322,51 @@ pub(super) async fn build_scalar_index( Ok(created_index) } +/// Build a canonical bitmap index segment over a caller-selected fragment set. 
+/// +/// This is intentionally separate from `build_scalar_index(..., fragment_ids=Some(...))`. +/// The latter is the legacy distributed scalar-index shard path. Here fragment ids only +/// restrict the scanned rows; the bitmap plugin receives no shard id and writes the +/// canonical bitmap layout for the staged segment root. +#[instrument(level = "debug", skip_all)] +pub(super) async fn build_bitmap_index_segment( + dataset: &Dataset, + column: &str, + uuid: &str, + fragment_ids: Vec, + progress: Arc, +) -> Result { + let field = dataset + .schema() + .field(column) + .ok_or(Error::invalid_input_source( + format!("No column with name {}", column).into(), + ))?; + let field: arrow_schema::Field = field.into(); + + let params = ScalarIndexParams::for_builtin(BuiltinIndexType::Bitmap); + let plugin = SCALAR_INDEX_PLUGIN_REGISTRY.get_plugin_by_name(¶ms.index_type)?; + let training_request = + plugin.new_training_request(params.params.as_deref().unwrap_or("{}"), &field)?; + let criteria = training_request.criteria(); + + progress.stage_start("load_data", None, "rows").await?; + let training_data = + load_training_data(dataset, column, criteria, None, true, Some(fragment_ids)).await?; + progress.stage_complete("load_data").await?; + + let index_store = LanceIndexStore::from_dataset_for_new(dataset, uuid)?; + plugin + .train_index( + training_data, + &index_store, + training_request, + None, + progress, + ) + .await +} + /// Fetches the scalar index plugin for a given index metadata /// /// The fast path, on newer datasets, is just a plugin lookup by the type URL of the index details. 
diff --git a/rust/lance/src/index/scalar/bitmap.rs b/rust/lance/src/index/scalar/bitmap.rs new file mode 100644 index 00000000000..11214a9bfdc --- /dev/null +++ b/rust/lance/src/index/scalar/bitmap.rs @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use lance_index::metrics::NoOpMetricsCollector; +use lance_index::scalar::bitmap::BitmapIndex; +use lance_index::scalar::lance_format::LanceIndexStore; +use lance_table::format::IndexMetadata; +use roaring::RoaringBitmap; +use std::sync::Arc; +use uuid::Uuid; + +use crate::{Dataset, Error, Result, dataset::index::LanceIndexStoreExt}; + +/// Merge one caller-defined group of source bitmap segments into a single segment. +pub(in crate::index) async fn merge_segments( + dataset: &Dataset, + segments: Vec, +) -> Result { + if segments.is_empty() { + return Err(Error::index("No segment metadata was provided".to_string())); + } + + let field_id = *segments[0].fields.first().ok_or_else(|| { + Error::invalid_input(format!( + "CreateIndex: segment {} is missing field ids", + segments[0].uuid + )) + })?; + let field_path = dataset.schema().field_path(field_id)?; + + let mut source_indices = Vec::with_capacity(segments.len()); + let mut fragment_bitmap = RoaringBitmap::new(); + for segment in &segments { + fragment_bitmap |= segment.fragment_bitmap.as_ref().cloned().ok_or_else(|| { + Error::invalid_input(format!( + "CreateIndex: segment {} is missing fragment coverage", + segment.uuid + )) + })?; + let scalar_index = + super::open_scalar_index(dataset, &field_path, segment, &NoOpMetricsCollector).await?; + let bitmap_index = scalar_index + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::index(format!( + "merge_existing_index_segments: expected bitmap segment {}, got {:?}", + segment.uuid, + scalar_index.index_type() + )) + })?; + source_indices.push(Arc::new(bitmap_index.clone())); + } + + let new_uuid = Uuid::new_v4(); + let new_store = 
LanceIndexStore::from_dataset_for_new(dataset, &new_uuid.to_string())?; + let created_index = lance_index::scalar::bitmap::merge_bitmap_indices( + &source_indices, + &new_store, + lance_index::progress::noop_progress(), + ) + .await?; + + Ok(IndexMetadata { + uuid: new_uuid, + fields: vec![field_id], + dataset_version: dataset.manifest.version, + fragment_bitmap: Some(fragment_bitmap), + index_details: Some(Arc::new(created_index.index_details)), + index_version: created_index.index_version as i32, + created_at: Some(chrono::Utc::now()), + base_id: None, + files: created_index.files, + ..segments[0].clone() + }) +} diff --git a/rust/lance/src/index/scalar_logical.rs b/rust/lance/src/index/scalar_logical.rs index 162a36a0c97..d2ab4e4b9f0 100644 --- a/rust/lance/src/index/scalar_logical.rs +++ b/rust/lance/src/index/scalar_logical.rs @@ -323,6 +323,7 @@ mod tests { use lance_datagen::array; use lance_index::IndexType; use lance_index::metrics::NoOpMetricsCollector; + use lance_index::scalar::bitmap::BITMAP_LOOKUP_NAME; use lance_index::scalar::{BuiltinIndexType, SargableQuery, ScalarIndexParams}; use crate::index::create::CreateIndexBuilder; @@ -498,6 +499,112 @@ mod tests { ); } + #[tokio::test] + async fn test_bitmap_segments_commit_and_query_as_logical_index() { + let test_dir = TempStrDir::default(); + let dataset = lance_datagen::gen_batch() + .col("value", array::step::()) + .into_dataset( + test_dir.as_str(), + FragmentCount::from(4), + FragmentRowCount::from(16), + ) + .await + .unwrap(); + let mut dataset = dataset; + let fragments = dataset.get_fragments(); + let params = ScalarIndexParams::for_builtin(BuiltinIndexType::Bitmap); + let mut staged = Vec::new(); + + for fragment_group in fragments.chunks(2) { + let fragment_ids = fragment_group + .iter() + .map(|fragment| fragment.id() as u32) + .collect::>(); + let segment = + CreateIndexBuilder::new(&mut dataset, &["value"], IndexType::Bitmap, ¶ms) + .name("value_bitmap".to_string()) + 
.fragments(fragment_ids.clone()) + .execute_uncommitted() + .await + .unwrap(); + assert_eq!( + segment + .fragment_bitmap + .as_ref() + .unwrap() + .iter() + .collect::>(), + fragment_ids + ); + let files = segment.files.as_ref().unwrap(); + assert!(files.iter().any(|file| file.path == BITMAP_LOOKUP_NAME)); + assert!(files.iter().all(|file| !file.path.starts_with("part_"))); + staged.push(segment); + } + + let staged_uuids = staged + .iter() + .map(|segment| segment.uuid) + .collect::>(); + let merged = dataset.merge_existing_index_segments(staged).await.unwrap(); + assert!(!staged_uuids.contains(&merged.uuid)); + assert_eq!( + merged + .fragment_bitmap + .as_ref() + .unwrap() + .iter() + .collect::>(), + fragments + .iter() + .map(|fragment| fragment.id() as u32) + .collect::>() + ); + let files = merged.files.as_ref().unwrap(); + assert!(files.iter().any(|file| file.path == BITMAP_LOOKUP_NAME)); + assert!(files.iter().all(|file| !file.path.starts_with("part_"))); + + dataset + .commit_existing_index_segments("value_bitmap", "value", vec![merged]) + .await + .unwrap(); + + let committed = dataset.load_indices_by_name("value_bitmap").await.unwrap(); + assert_eq!(committed.len(), 1); + assert_eq!( + scalar_index_fragment_bitmap(&dataset, "value", "value_bitmap") + .await + .unwrap() + .unwrap(), + dataset.fragment_bitmap.as_ref().clone() + ); + + let logical = + open_named_scalar_index(&dataset, "value", "value_bitmap", &NoOpMetricsCollector) + .await + .unwrap(); + assert_eq!(logical.index_type(), IndexType::Bitmap); + + let query = SargableQuery::Equals(ScalarValue::Int32(Some(20))); + let result = logical.search(&query, &NoOpMetricsCollector).await.unwrap(); + let row_addrs = match result { + SearchResult::Exact(row_addrs) => row_addrs, + other => panic!( + "expected exact result from segmented bitmap, got {:?}", + other + ), + }; + + let searched_fragments = row_addrs + .true_rows() + .row_addrs() + .unwrap() + .map(|row_addr| 
RowAddress::from(u64::from(row_addr)).fragment_id()) + .collect::>(); + assert_eq!(searched_fragments, vec![1]); + } + #[tokio::test] async fn test_zonemap_segment_search_keeps_fragment_ids() { let dataset = lance_datagen::gen_batch() From 880745bda8833670463499c75916a2e77e2ba01e Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Wed, 3 Jun 2026 09:29:15 -0700 Subject: [PATCH 003/177] fix(filtered-read): record IO metrics even when filter matches no rows (#7057) In the OnePartitionMultipleThreads path, io_metrics.record was only called inside inspect_ok on the output batch stream. When a filter produces zero matching rows, no batches flow through and inspect_ok never fires, leaving bytes_read/iops/requests at 0 despite I/O having occurred. Fix by also recording a final snapshot in the finally handler. --------- Co-authored-by: Claude Sonnet 4.6 --- rust/lance/src/io/exec/filtered_read.rs | 56 +++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/rust/lance/src/io/exec/filtered_read.rs b/rust/lance/src/io/exec/filtered_read.rs index c491e3f194b..d0f0b229cb2 100644 --- a/rust/lance/src/io/exec/filtered_read.rs +++ b/rust/lance/src/io/exec/filtered_read.rs @@ -997,6 +997,10 @@ impl FilteredReadStream { base_batch_stream.boxed() }; + // Clone so the finally handler can record a final snapshot even when + // no output batches were produced (inspect_ok never fires in that case). 
+ let global_metrics_final = global_metrics.clone(); + let scan_scheduler_final = scan_scheduler.clone(); let batch_stream = batch_stream .inspect_ok(move |batch| { partition_metrics_clone @@ -1005,6 +1009,9 @@ impl FilteredReadStream { global_metrics.io_metrics.record(&scan_scheduler); }) .finally(move || { + global_metrics_final + .io_metrics + .record(&scan_scheduler_final); partition_metrics.baseline_metrics.done(); }) .map_err(|e: lance_core::Error| DataFusionError::External(e.into())) @@ -3716,6 +3723,55 @@ mod tests { assert!(iops > 0, "Should have recorded IO operations"); } + // Reproduces a bug where bytes_read (and iops/requests) stay at 0 when a filter matches + // no rows. io_metrics.record is only called inside inspect_ok on the output batch stream, + // so when the filter produces zero output batches, the I/O that did occur is never counted. + #[tokio::test] + async fn test_io_metrics_recorded_when_filter_matches_no_rows() { + let fixture = TestFixture::new().await; + // not_indexed values in the fixture go up to ~400; this filter matches nothing + let filter_plan = fixture.filter_plan("not_indexed > 10000", false).await; + let options = + FilteredReadOptions::basic_full_read(&fixture.dataset).with_filter_plan(filter_plan); + let filtered_read = + Arc::new(FilteredReadExec::try_new(fixture.dataset.clone(), options, None).unwrap()); + + let batches = filtered_read + .execute(0, Arc::new(TaskContext::default())) + .unwrap() + .try_collect::>() + .await + .unwrap(); + assert_eq!( + batches.iter().map(|b| b.num_rows()).sum::(), + 0, + "filter should match no rows" + ); + + let metrics = filtered_read.metrics().unwrap(); + + let rows_scanned = metrics + .sum_by_name("rows_scanned") + .map(|v| v.as_usize()) + .unwrap_or(0); + assert!( + rows_scanned > 0, + "rows_scanned ({}) should be > 0: data was read even though filter matched nothing", + rows_scanned + ); + + let bytes_read = metrics + .sum_by_name("bytes_read") + .map(|v| v.as_usize()) + 
.unwrap_or(0); + assert!( + bytes_read > 0, + "bytes_read ({}) should be > 0: io_metrics.record is only called when output batches \ + are produced, so bytes_read stays 0 even though I/O occurred", + bytes_read + ); + } + /// Test that direct execution gives the same result as get_plan + execute_with_plan #[test_log::test(tokio::test)] async fn test_plan_round_trip() { From e7cbad9cc4cc4114269083329ad54f3c904ebc0c Mon Sep 17 00:00:00 2001 From: Vova Kolmakov Date: Wed, 3 Jun 2026 23:44:47 +0700 Subject: [PATCH 004/177] perf: make HNSW cheaper to load (#6798) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Closes #6746. Loading an HNSW partition no longer reconstructs a per-node `Vec<GraphBuilderNode>` / `Vec<OrderedNode>` graph. The loaded graph is now backed directly by the on-disk Arrow buffers, with neighbor adjacency served as zero-copy `&[u32]` slices straight out of the `__neighbors` `ListArray` value buffer. This unblocks a future zero-copy `CacheCodec` (#6745). ## Motivation Per #6746, loading an HNSW partition required expensive per-node reconstruction, which makes a zero-copy IPC `CacheCodec` (#6745) infeasible. The fix is to keep the Arrow data and offsets as the graph's backing store while preserving current search behavior and performance. ## What changed - `HnswCore` now holds an `HnswGraph` enum instead of `Arc<Vec<GraphBuilderNode>>`: `Built` (in-memory, produced by the online builder / `index_vectors` — build path untouched) or `Loaded` (Arrow-backed, search-only). - `LoadedHnswGraph` retains the full `RecordBatch` plus per-level zero-copy `ListArray` neighbor views and a tiny per-upper-level `id -> row` lookup; the geometrically-shrinking upper levels keep these maps negligible.
- Level 0 uses a `Dense` lookup (`row == __vector_id`, asserted in debug); upper levels use a `Sparse` map keyed by `__vector_id` value, exactly mirroring the old per-node `load` — including the known `level_offsets` quirk where the entry-point node is written by `to_batch` at every level but counted only at level 0, so upper-level slices are off-by-one and duplicate ids resolve last-write-wins. - The search loop is single-sourced across both backends via a generic `run_search` helper (monomorphized over the per-level and bottom-level view types), keeping the existing `Graph` / `BorrowingGraph` seam; search is unchanged. - `to_batch()` on a loaded graph is a verbatim passthrough (re-stamped metadata only), so the IVF partition cache (`ivf/partition_serde.rs`, which re-serializes loaded indices) round-trips losslessly and #6745 can write/read it through `lance_arrow::ipc` without rebuilding the graph. ## Correctness & compatibility - Loaded-graph search is bit-identical to the in-memory build across L2 / Dot / Hamming and graph sizes (single node, pair, multi-level 2048). - Old `load` semantics are preserved bit-for-bit, including duplicate-id last-write-wins across a misaligned slice boundary; `build -> to_batch -> load -> to_batch` is byte-stable (`b1 == b2`). - Public API: `HNSW::nodes()` now returns an `Option` — `Some` for a freshly built graph and `None` for a loaded graph, which is Arrow-backed and has no builder nodes (documented; `GraphBuilderNode` is internal API and there are no in-tree callers). ## Benchmarks `criterion --quick`, 100000×128, L2, k=100, ef=300 (`rust/lance-index/benches/hnsw.rs`). The "before" `load_hnsw` was measured by running this same bench against the parent commit's reconstruction-based `builder.rs` (only that file swapped), so it is a like-for-like `HNSW::load` comparison.
| Benchmark | Before (reconstruction load) | After (Arrow-backed) | Δ | | --- | --- | --- | --- | | `load_hnsw(100000x128)` | ~127 ms | ~90.8 µs | ~1,400× faster | | `search_hnsw100000x128` (built, baseline) | ~700.7 µs | ~700.7 µs | unchanged | | `search_hnsw_loaded100000x128` | n/a | ~690.4 µs | on par with built (within noise) | Load drops from ~127 ms (allocating 100k `GraphBuilderNode`s + per-node `OrderedNode` adjacency) to ~91 µs (batch slice + tiny upper-level sparse maps), while search on the Arrow-backed graph stays on par with the in-memory build. Numbers are `--quick`/indicative; the ~3-orders-of-magnitude load delta is well outside noise. Re-run a full `cargo bench` before merge for headline figures. ## Tests All in `rust/lance-index/src/vector/hnsw/builder.rs`: - `test_loaded_search_parity_and_recall` (rstest: L2 single / L2 pair / L2 2048 / Dot 2048) — built vs loaded parity plus recall ≥ 0.5. - `test_loaded_level_offsets_misalignment_invariant` — pins the entry-point-written-at-every-level surplus (`batch.num_rows() > sum(level_count)`), the Dense level-0 precondition, and loaded↔built search parity despite the misalignment. - `test_loaded_empty_index` — 0-row `to_batch` → `load` → empty graph round-trip. - `test_to_batch_roundtrip_loaded` — the IVF partition-cache path: `to_batch` on a loaded index is byte-stable and reloads/searches identically. - `test_loaded_graph_is_arrow_backed` — loaded graph is strictly lighter than the built representation. - Pre-existing `test_builder_write_load` (2048, L2, file round-trip) and `test_builder_write_load_binary_hamming` (256, Hamming) continue to pass unchanged. 
--------- Co-authored-by: Vova Kolmakov Co-authored-by: Claude Opus 4.7 (1M context) --- rust/lance-index/benches/hnsw.rs | 65 +- rust/lance-index/src/vector/hnsw/builder.rs | 747 ++++++++++++++++++-- 2 files changed, 758 insertions(+), 54 deletions(-) diff --git a/rust/lance-index/benches/hnsw.rs b/rust/lance-index/benches/hnsw.rs index 1aafd30188c..0a9b10bf42c 100644 --- a/rust/lance-index/benches/hnsw.rs +++ b/rust/lance-index/benches/hnsw.rs @@ -95,6 +95,67 @@ fn bench_hnsw(c: &mut Criterion) { }); } +fn bench_hnsw_load(c: &mut Criterion) { + const DIMENSION: usize = 128; + const TOTAL: usize = 100_000; + const SEED: [u8; 32] = [42; 32]; + const K: usize = 100; + + let rt = tokio::runtime::Runtime::new().unwrap(); + + let data = generate_random_array_with_seed::(TOTAL * DIMENSION, SEED); + let fsl = FixedSizeListArray::try_new_from_values(data, DIMENSION as i32).unwrap(); + let vectors = Arc::new(FlatFloatStorage::new(fsl.clone(), DistanceType::L2)); + + let search_build_pool = ThreadPoolBuilder::new().num_threads(1).build().unwrap(); + let hnsw = search_build_pool + .install(|| HNSW::index_vectors(vectors.as_ref(), HnswBuildParams::default())) + .unwrap(); + let batch = hnsw.to_batch().unwrap(); + + // Load cost -- the path #6746 targets. `RecordBatch::clone` is an Arrow + // refcount bump (what production does anyway: each partition-cache IPC + // read yields a fresh batch), so it does not mask the load work measured. + c.bench_function(format!("load_hnsw({TOTAL}x{DIMENSION})").as_str(), |b| { + b.iter(|| { + let loaded = HNSW::load(batch.clone()).unwrap(); + assert_eq!(loaded.len(), TOTAL); + }) + }); + + // Search on the Arrow-backed loaded graph -- same TOTAL/DIMENSION/K/ef as + // the `search_hnsw` bench, so the two are directly comparable and confirm + // the new backend keeps search latency unchanged (issue #6746). 
+ let loaded = HNSW::load(batch).unwrap(); + let query = fsl.value(0); + c.bench_function( + format!("search_hnsw_loaded{TOTAL}x{DIMENSION}").as_str(), + |b| { + b.to_async(&rt).iter(|| async { + let uids: HashSet = loaded + .search_basic( + query.clone(), + K, + &HnswQueryParams { + ef: 300, + lower_bound: None, + upper_bound: None, + dist_q_c: 0.0, + }, + None, + vectors.as_ref(), + ) + .unwrap() + .iter() + .map(|node| node.id) + .collect(); + + assert_eq!(uids.len(), K); + }) + }, + ); +} + fn bench_hnsw_sq(c: &mut Criterion) { const DIMENSION: usize = 128; const TOTAL: usize = 100_000; @@ -291,7 +352,7 @@ criterion_group!( .measurement_time(Duration::from_secs(10)) .sample_size(10) .with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); - targets = bench_hnsw, bench_hnsw_sq, bench_hnsw_pq); + targets = bench_hnsw, bench_hnsw_load, bench_hnsw_sq, bench_hnsw_pq); // Non-linux version does not support pprof. #[cfg(not(target_os = "linux"))] @@ -300,6 +361,6 @@ criterion_group!( config = Criterion::default() .measurement_time(Duration::from_secs(10)) .sample_size(10); - targets = bench_hnsw, bench_hnsw_sq, bench_hnsw_pq); + targets = bench_hnsw, bench_hnsw_load, bench_hnsw_sq, bench_hnsw_pq); criterion_main!(benches); diff --git a/rust/lance-index/src/vector/hnsw/builder.rs b/rust/lance-index/src/vector/hnsw/builder.rs index cf5c8864281..789dc7ef904 100644 --- a/rust/lance-index/src/vector/hnsw/builder.rs +++ b/rust/lance-index/src/vector/hnsw/builder.rs @@ -5,8 +5,8 @@ use arrow::array::{AsArray, ListBuilder, UInt32Builder}; use arrow::compute::concat_batches; -use arrow::datatypes::{DataType, Float32Type, UInt32Type}; -use arrow_array::{ArrayRef, Float32Array, RecordBatch, UInt64Array}; +use arrow::datatypes::{DataType, UInt32Type}; +use arrow_array::{ArrayRef, Float32Array, ListArray, RecordBatch, UInt64Array}; use crossbeam_queue::ArrayQueue; use deepsize::DeepSizeOf; use itertools::Itertools; @@ -42,7 +42,7 @@ use crate::vector::graph::{ use 
crate::vector::graph::{Visited, beam_search_borrowed, greedy_search, greedy_search_borrowed}; use crate::vector::storage::{DistCalculator, VectorStore}; use crate::vector::v3::subindex::IvfSubIndex; -use crate::vector::{DIST_COL, Query, VECTOR_RESULT_SCHEMA}; +use crate::vector::{Query, VECTOR_RESULT_SCHEMA}; pub const HNSW_METADATA_KEY: &str = "lance:hnsw"; @@ -158,7 +158,7 @@ pub struct HNSW { struct HnswCore { params: HnswBuildParams, - nodes: Arc>, + graph: HnswGraph, level_count: Vec, entry_point: u32, visited_generator_queue: Arc>, @@ -167,7 +167,7 @@ struct HnswCore { impl DeepSizeOf for HnswCore { fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { self.params.deep_size_of_children(context) - + self.nodes.deep_size_of_children(context) + + self.graph.deep_size_of_children(context) + self.level_count.deep_size_of_children(context) // Skipping the visited_generator_queue } @@ -181,10 +181,6 @@ impl HnswCore { fn num_nodes(&self, level: usize) -> usize { self.level_count[level] } - - fn nodes(&self) -> Arc> { - self.nodes.clone() - } } impl Debug for HNSW { @@ -210,7 +206,7 @@ impl HNSW { Self { inner: Arc::new(HnswCore { params, - nodes: Arc::new(nodes), + graph: HnswGraph::Built(Arc::new(nodes)), level_count, entry_point, visited_generator_queue, @@ -222,7 +218,7 @@ impl HNSW { Self { inner: Arc::new(HnswCore { params: HnswBuildParams::default(), - nodes: Arc::new(Vec::new()), + graph: HnswGraph::Built(Arc::new(Vec::new())), level_count: Vec::new(), entry_point: 0, visited_generator_queue: Arc::new(ArrayQueue::new(1)), @@ -231,7 +227,11 @@ impl HNSW { } pub fn len(&self) -> usize { - self.inner.nodes.len() + match &self.inner.graph { + HnswGraph::Built(nodes) => nodes.len(), + // `level_count[0]` is the bottom-level (== total) node count. 
+ HnswGraph::Loaded(graph) => graph.level_count[0], + } } pub fn is_empty(&self) -> bool { @@ -246,8 +246,15 @@ impl HNSW { self.inner.num_nodes(level) } - pub fn nodes(&self) -> Arc> { - self.inner.nodes() + /// Returns the in-memory builder nodes, if this graph was freshly built. + /// + /// A disk-loaded graph is Arrow-backed and has no `GraphBuilderNode`s, + /// so this returns `None` for it. + pub fn nodes(&self) -> Option>> { + match &self.inner.graph { + HnswGraph::Built(nodes) => Some(nodes.clone()), + HnswGraph::Loaded(_) => None, + } } #[allow(clippy::too_many_arguments)] @@ -263,32 +270,96 @@ impl HNSW { ) -> Result> { let dist_calc = storage.dist_calculator(query, params.dist_q_c); let entry = self.inner.entry_point; - let mut ep = OrderedNode::new(entry, dist_calc.distance(entry).into()); - let nodes = self.inner.nodes.as_ref(); + let ep = OrderedNode::new(entry, dist_calc.distance(entry).into()); + + // The level descent + bottom beam search are identical across + // graph backends; only the view types differ. `run_search` is + // generic over those view types so the loop is single-sourced: + // each backend supplies a per-level view closure and a + // bottom-level view. 
+ let result = match &self.inner.graph { + HnswGraph::Built(nodes) => { + let nodes = nodes.as_slice(); + self.run_search( + ep, + k, + params, + bitset.as_ref(), + visited_generator, + storage.len(), + prefetch_distance, + &dist_calc, + |level| ImmutableHnswLevelView::new(level, nodes), + ImmutableHnswBottomView::new(nodes), + ) + } + HnswGraph::Loaded(graph) => { + let graph = graph.as_ref(); + self.run_search( + ep, + k, + params, + bitset.as_ref(), + visited_generator, + storage.len(), + prefetch_distance, + &dist_calc, + |level| LoadedHnswLevelView::new(level, graph), + LoadedHnswBottomView::new(graph), + ) + } + }; + Ok(result) + } + + /// Drives the shared HNSW query path over backend-specific graph + /// views: a per-level view produced by `make_level` and a + /// bottom-level view `bottom`. The views borrow their backing store + /// and are created, used, and dropped entirely within this call; + /// only the owned result escapes. Monomorphizing over `L`/`B` is the + /// single seam that lets the in-memory and disk-loaded backends + /// share one search loop. 
+ #[allow(clippy::too_many_arguments)] + fn run_search( + &self, + ep: OrderedNode, + k: usize, + params: &HnswQueryParams, + bitset: Option<&Visited>, + visited_generator: &mut VisitedGenerator, + storage_len: usize, + prefetch_distance: Option, + dist_calc: &impl DistCalculator, + make_level: impl Fn(u16) -> L, + bottom: B, + ) -> Vec + where + L: BorrowingGraph, + B: BorrowingGraph, + { + let mut ep = ep; for level in (0..self.max_level()).rev() { - let cur_level = ImmutableHnswLevelView::new(level, nodes); + let cur_level = make_level(level); ep = greedy_search_borrowed( &cur_level, ep, - &dist_calc, + dist_calc, self.inner.params.prefetch_distance, ); } - - let bottom_level = ImmutableHnswBottomView::new(nodes); - let mut visited = visited_generator.generate(storage.len()); - Ok(beam_search_borrowed( - &bottom_level, + let mut visited = visited_generator.generate(storage_len); + beam_search_borrowed( + &bottom, &ep, params, - &dist_calc, - bitset.as_ref(), + dist_calc, + bitset, prefetch_distance, &mut visited, ) .into_iter() .take(k) - .collect()) + .collect::>() } #[instrument(level = "debug", skip(self, query, bitset, storage))] @@ -456,7 +527,7 @@ impl HnswBuilder { HNSW { inner: Arc::new(HnswCore { params: self.params, - nodes: Arc::new(nodes), + graph: HnswGraph::Built(Arc::new(nodes)), level_count, entry_point: self.entry_point, visited_generator_queue: self.visited_generator_queue, @@ -716,6 +787,177 @@ impl BorrowingGraph for ImmutableHnswBottomView<'_> { } } +/// Per-level node-id -> row-index lookup for a disk-loaded HNSW graph. +enum LevelLookup { + /// `row == node id`. Used only for level 0, where [`HNSW::to_batch`] + /// writes every node once in ascending `__vector_id` (== node id) order, + /// so the level-0 slice is exactly `[0, N)` with `row == id`. + Dense, + /// Upper level: an explicit `node_id -> row` map built from the level's + /// `__vector_id` column. 
+ /// + /// We do *not* assume the column is sorted or that the slice is aligned + /// to a true level boundary: `level_offsets`/`level_count` omit the + /// entry-point node (it is written at every level by `to_batch` but only + /// counted at level 0), so upper-level slices can be off-by-one and + /// non-monotonic. Keying by the `__vector_id` value -- exactly what the + /// old per-node `load` did -- preserves behavior bit-for-bit. Upper + /// levels shrink geometrically, so this map stays tiny. + Sparse(HashMap), +} + +/// A search-only HNSW graph backed directly by the Arrow buffers of the +/// on-disk `RecordBatch`. +/// +/// Loading performs no per-node reconstruction: neighbor adjacency is served +/// as `&[u32]` slices straight out of the `__neighbors` `ListArray` value +/// buffer (zero copy). The full `batch` is retained so [`HNSW::to_batch`] is a +/// near-free passthrough -- required, because the IVF partition cache +/// re-serializes loaded indices through `to_batch()` +/// (`lance/src/index/vector/ivf/partition_serde.rs`) -- and so a future +/// zero-copy `CacheCodec` (#6745) can write/read it through +/// `lance_arrow::ipc` without rebuilding the graph. +struct LoadedHnswGraph { + /// The full loaded batch (all levels concatenated, level 0 first), + /// retained verbatim for `to_batch()` and #6745. + batch: RecordBatch, + /// Per-level `__neighbors` `List`, zero-copy slices of `batch`. + level_neighbors: Vec, + /// Per-level node-id -> row lookup (see [`LevelLookup`]). + level_lookup: Vec, + /// Number of nodes present at each level (`level_count[0]` == total). + level_count: Vec, +} + +impl DeepSizeOf for LoadedHnswGraph { + fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + // `level_neighbors` are zero-copy views into `batch`, so counting + // `batch` alone avoids double counting (mirrors + // `vector/flat/storage.rs`). 
The upper-level `level_lookup` maps are + // sized to the geometrically-shrinking node counts above level 0 -- + // negligible next to the batch and not separately accounted here. + self.batch.get_array_memory_size() + } +} + +impl LoadedHnswGraph { + /// Borrow the neighbor ids of `key` at `level` directly from the Arrow + /// `ListArray` value buffer -- no allocation, no copy. + #[inline] + fn neighbors_at(&self, level: usize, key: u32) -> &[u32] { + let row = match &self.level_lookup[level] { + LevelLookup::Dense => key as usize, + LevelLookup::Sparse(id_to_row) => match id_to_row.get(&key) { + Some(&row) => row as usize, + // The node is absent at this level -- e.g. an empty upper + // level the search descends through, or a node that only + // exists at lower levels. Mirror the old representation + // (`level_neighbors[level]` defaulted to empty): no + // neighbors here, so greedy search stays put and descends. + None => return &[], + }, + }; + let list = &self.level_neighbors[level]; + let offsets = list.value_offsets(); + let start = offsets[row] as usize; + let end = offsets[row + 1] as usize; + // The `__neighbors` list child is `UInt32` per `HNSW::schema()`. + // Validity bitmap is ignored on purpose: `to_batch` never writes null + // neighbor lists, matching the previous `.unwrap()`-based load. + let values = list.values().as_primitive::(); + &values.values()[start..end] + } +} + +/// Per-level search view over a disk-loaded [`LoadedHnswGraph`]. +pub(crate) struct LoadedHnswLevelView<'a> { + level: usize, + graph: &'a LoadedHnswGraph, +} + +impl<'a> LoadedHnswLevelView<'a> { + fn new(level: u16, graph: &'a LoadedHnswGraph) -> Self { + Self { + level: level as usize, + graph, + } + } +} + +impl Graph for LoadedHnswLevelView<'_> { + fn len(&self) -> usize { + // Mirrors `ImmutableHnswLevelView::len` (total node count). 
+ self.graph.level_count[0] + } + + fn neighbors(&self, key: u32) -> Arc> { + // Non-hot fallback: HNSW search goes through `BorrowingGraph`. Kept + // only so the `Graph` trait / legacy `greedy_search` need no + // special-casing for loaded graphs. + Arc::new(self.graph.neighbors_at(self.level, key).to_vec()) + } +} + +impl BorrowingGraph for LoadedHnswLevelView<'_> { + fn len(&self) -> usize { + self.graph.level_count[0] + } + + fn neighbors(&self, key: u32) -> &[u32] { + self.graph.neighbors_at(self.level, key) + } +} + +/// Bottom-level (level 0) search view over a disk-loaded [`LoadedHnswGraph`]. +pub(crate) struct LoadedHnswBottomView<'a> { + graph: &'a LoadedHnswGraph, +} + +impl<'a> LoadedHnswBottomView<'a> { + fn new(graph: &'a LoadedHnswGraph) -> Self { + Self { graph } + } +} + +impl Graph for LoadedHnswBottomView<'_> { + fn len(&self) -> usize { + self.graph.level_count[0] + } + + fn neighbors(&self, key: u32) -> Arc> { + Arc::new(self.graph.neighbors_at(0, key).to_vec()) + } +} + +impl BorrowingGraph for LoadedHnswBottomView<'_> { + fn len(&self) -> usize { + self.graph.level_count[0] + } + + fn neighbors(&self, key: u32) -> &[u32] { + self.graph.neighbors_at(0, key) + } +} + +/// The graph backing an [`HNSW`]: either built in memory or disk-loaded. +enum HnswGraph { + /// Built in memory by the (online) builder / `index_vectors` / + /// `from_parts`. Mutable-shaped `GraphBuilderNode`s; `to_batch()` + /// re-encodes from these (it needs the per-node ranked distances). + Built(Arc>), + /// Loaded from disk, Arrow-backed, search-only. 
+ Loaded(Arc), +} + +impl DeepSizeOf for HnswGraph { + fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + match self { + Self::Built(nodes) => nodes.deep_size_of_children(context), + Self::Loaded(graph) => graph.deep_size_of_children(context), + } + } +} + #[derive(Debug, Clone, Copy)] pub struct HnswQueryParams { pub ef: usize, @@ -760,39 +1002,81 @@ impl IvfSubIndex for HNSW { )) })?; - let levels: Vec<_> = hnsw_metadata + // Slice the concatenated batch into one (zero-copy) view per level. + let level_batches: Vec = hnsw_metadata .level_offsets .iter() .tuple_windows() .map(|(start, end)| data.slice(*start, end - start)) .collect(); - let level_count = levels.iter().map(|b| b.num_rows()).collect::>(); + let level_count = level_batches + .iter() + .map(|b| b.num_rows()) + .collect::>(); - let bottom_level_len = levels[0].num_rows(); - let mut nodes = Vec::with_capacity(bottom_level_len); - for i in 0..bottom_level_len { - nodes.push(GraphBuilderNode::new(i as u32, levels.len())); - } - for (level, batch) in levels.into_iter().enumerate() { + // No per-node reconstruction: keep the Arrow adjacency buffers as-is + // and only build the tiny per-upper-level id->row lookups. The + // `__distance` column is never materialized here -- search doesn't + // need it, and `to_batch()` returns the retained `data` verbatim. + let mut level_neighbors = Vec::with_capacity(level_batches.len()); + let mut level_lookup = Vec::with_capacity(level_batches.len()); + for (level, batch) in level_batches.iter().enumerate() { + // `.clone()` on an Arrow array bumps a refcount; buffers stay + // shared with `data` (zero copy). 
+ let neighbors = batch[NEIGHBORS_COL].as_list::().clone(); let ids = batch[VECTOR_ID_COL].as_primitive::(); - let neighbors = batch[NEIGHBORS_COL].as_list::(); - let distances = batch[DIST_COL].as_list::(); - - for ((node, neighbors), distances) in - ids.iter().zip(neighbors.iter()).zip(distances.iter()) - { - let node = node.unwrap(); - let neighbors = neighbors.as_ref().unwrap().as_primitive::(); - let distances = distances.as_ref().unwrap().as_primitive::(); - - nodes[node as usize].level_neighbors_ranked[level] = neighbors + if level == 0 { + // `to_batch` writes every node at level 0 exactly once in + // ascending `__vector_id` (== node id) order, so the level-0 + // slice is exactly `[0, N)` and the row index *is* the node + // id. The `Dense` lookup below depends on this: in a release + // build a violated invariant would silently make search read + // the wrong neighbor list, so enforce it at load time (not via + // `debug_assert!`) and reject a malformed or version- + // incompatible batch. + if let Some((row, id)) = ids + .values() + .iter() + .enumerate() + .find(|&(row, id)| *id != row as u32) + { + return Err(Error::index(format!( + "HNSW level-0 __vector_id must equal the row index, but \ + row {row} has __vector_id {id}; the on-disk batch is \ + malformed or was written by an incompatible version" + ))); + } + level_lookup.push(LevelLookup::Dense); + } else { + // Upper levels: explicit id -> row map. No ordering/alignment + // assumption (see `LevelLookup::Sparse`). On the rare + // duplicate id (a misaligned slice can repeat one across a + // level boundary) the last wins, matching the old load's + // `nodes[id].level_neighbors[level] = ...` last-write. 
+ let id_to_row: HashMap = ids + .values() .iter() - .zip(distances.iter()) - .map(|(n, dist)| OrderedNode::new(n.unwrap(), OrderedFloat(dist.unwrap()))) + .enumerate() + .map(|(row, id)| (*id, row as u32)) .collect(); - nodes[node as usize].update_from_ranked_neighbors(level as u16); + level_lookup.push(LevelLookup::Sparse(id_to_row)); } + level_neighbors.push(neighbors); + } + + // `entry_point` is read from untrusted metadata and indexes the `Dense` + // level-0 lookup directly; an out-of-range value would read past the + // level-0 neighbor buffer during search. Validate it under the same + // persisted-format invariant as the level-0 ids above. + let num_nodes = level_count[0]; + if hnsw_metadata.entry_point as usize >= num_nodes { + return Err(Error::index(format!( + "HNSW entry_point {} is out of range for a graph with {num_nodes} \ + nodes; the on-disk batch is malformed or was written by an \ + incompatible version", + hnsw_metadata.entry_point + ))); } let visited_generator_queue = @@ -802,9 +1086,16 @@ impl IvfSubIndex for HNSW { .push(VisitedGenerator::new(0)) .unwrap(); } + + let graph = LoadedHnswGraph { + batch: data, + level_neighbors, + level_lookup, + level_count: level_count.clone(), + }; let inner = HnswCore { params: hnsw_metadata.params, - nodes: Arc::new(nodes), + graph: HnswGraph::Loaded(Arc::new(graph)), level_count, entry_point: hnsw_metadata.entry_point, visited_generator_queue, @@ -941,6 +1232,28 @@ impl IvfSubIndex for HNSW { /// Encode the sub index into a record batch fn to_batch(&self) -> Result { + let nodes = match &self.inner.graph { + HnswGraph::Built(nodes) => nodes, + HnswGraph::Loaded(graph) => { + // A loaded graph is already Arrow-backed: return the retained + // batch verbatim, re-stamped with up-to-date HNSW metadata. + // The IVF partition cache re-serializes loaded indices through + // here (`ivf/partition_serde.rs`), so this must round-trip. 
+ let metadata = serde_json::to_string(&self.metadata())?; + let schema = + graph + .batch + .schema() + .as_ref() + .clone() + .with_metadata(HashMap::from_iter(vec![( + HNSW_METADATA_KEY.to_string(), + metadata, + )])); + return Ok(graph.batch.clone().with_schema(Arc::new(schema))?); + } + }; + let mut vector_id_builder = UInt32Builder::with_capacity(self.len()); let mut neighbors_builder = ListBuilder::with_capacity(UInt32Builder::new(), self.len()); let mut distances_builder = @@ -948,7 +1261,7 @@ impl IvfSubIndex for HNSW { let mut batches = Vec::with_capacity(self.max_level() as usize); for level in 0..self.max_level() { let level = level as usize; - for (id, node) in self.inner.nodes.iter().enumerate() { + for (id, node) in nodes.iter().enumerate() { if level >= node.level_neighbors.len() { continue; } @@ -991,8 +1304,9 @@ impl IvfSubIndex for HNSW { mod tests { use std::sync::Arc; - use arrow_array::{FixedSizeListArray, UInt8Array}; + use arrow_array::{ArrayRef, FixedSizeListArray, RecordBatch, UInt8Array, UInt32Array}; use arrow_schema::Schema; + use deepsize::DeepSizeOf; use lance_arrow::FixedSizeListArrayExt; use lance_file::previous::{ reader::FileReader as PreviousFileReader, @@ -1006,8 +1320,11 @@ mod tests { use lance_table::io::manifest::ManifestDescribing; use lance_testing::datagen::generate_random_array; use object_store::path::Path; + use rstest::rstest; + use super::HnswGraph; use crate::scalar::IndexWriter; + use crate::vector::storage::{DistCalculator, VectorStore}; use crate::vector::v3::subindex::IvfSubIndex; use crate::vector::{ flat::storage::{FlatBinStorage, FlatFloatStorage}, @@ -1140,4 +1457,330 @@ mod tests { .unwrap(); assert_eq!(builder_results, loaded_results); } + + /// Brute-force top-`k` node ids by distance -- recall ground truth. 
+ fn brute_force_topk(store: &FlatFloatStorage, query: ArrayRef, k: usize) -> Vec { + let dist_calc = store.dist_calculator(query, 0.0); + let mut all: Vec<(f32, u32)> = (0..store.len() as u32) + .map(|id| (dist_calc.distance(id), id)) + .collect(); + all.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap()); + all.into_iter().take(k).map(|(_, id)| id).collect() + } + + /// The Arrow-backed loaded graph must search bit-identically to the + /// in-memory build, across distance types and graph sizes (single node, + /// pair, and a multi-level graph exercising the sparse upper-level + /// id->row lookup). + #[rstest] + #[case::l2_single(DistanceType::L2, 1)] + #[case::l2_pair(DistanceType::L2, 2)] + #[case::l2_multi_level(DistanceType::L2, 2048)] + #[case::dot_multi_level(DistanceType::Dot, 2048)] + #[tokio::test] + async fn test_loaded_search_parity_and_recall( + #[case] distance_type: DistanceType, + #[case] total: usize, + ) { + const DIM: usize = 32; + let fsl = + FixedSizeListArray::try_new_from_values(generate_random_array(total * DIM), DIM as i32) + .unwrap(); + let store = Arc::new(FlatFloatStorage::new(fsl.clone(), distance_type)); + let builder = HNSW::index_vectors( + store.as_ref(), + HnswBuildParams::default().num_edges(20).ef_construction(50), + ) + .unwrap(); + assert!(!matches!(builder.inner.graph, HnswGraph::Loaded(_))); + + let loaded = HNSW::load(builder.to_batch().unwrap()).unwrap(); + assert!(matches!(loaded.inner.graph, HnswGraph::Loaded(_))); + assert_eq!(loaded.len(), total); + + let k = total.min(10); + let params = HnswQueryParams { + ef: 50, + lower_bound: None, + upper_bound: None, + dist_q_c: 0.0, + }; + let query = fsl.value(0); + + let builder_results = builder + .search_basic(query.clone(), k, ¶ms, None, store.as_ref()) + .unwrap(); + let loaded_results = loaded + .search_basic(query.clone(), k, ¶ms, None, store.as_ref()) + .unwrap(); + assert_eq!(builder_results, loaded_results); + + // Recall vs brute-force ground truth (project rule: >= 
0.5). + let truth: std::collections::HashSet = brute_force_topk(store.as_ref(), query, k) + .into_iter() + .collect(); + let hits = loaded_results + .iter() + .filter(|n| truth.contains(&n.id)) + .count(); + let recall = hits as f32 / k as f32; + assert!(recall >= 0.5, "recall {recall} below 0.5 (k={k})"); + } + + /// Regression guard for the `level_offsets` misalignment (issue #6746). + /// `to_batch` writes the entry-point node at *every* level, but + /// `level_count` only counts it at level 0, so the serialized batch has + /// strictly more rows than `sum(level_count)` and the upper-level + /// `level_offsets` slices are off-by-one / non-monotonic. The Arrow-backed + /// loaded graph must still search bit-identically to the in-memory build: + /// it keys upper levels by `__vector_id` value via the `Sparse` map + /// (last-write-wins), never `row == id`. A naive `row == id` + /// reimplementation would pass the small cases but break here. + #[tokio::test] + async fn test_loaded_level_offsets_misalignment_invariant() { + use arrow::array::AsArray; + use arrow::datatypes::UInt32Type; + + const DIM: usize = 32; + const TOTAL: usize = 2048; + let fsl = + FixedSizeListArray::try_new_from_values(generate_random_array(TOTAL * DIM), DIM as i32) + .unwrap(); + let store = Arc::new(FlatFloatStorage::new(fsl.clone(), DistanceType::L2)); + let builder = HNSW::index_vectors( + store.as_ref(), + HnswBuildParams::default().num_edges(20).ef_construction(50), + ) + .unwrap(); + + // The scenario only exists on a multi-level graph. + assert!( + builder.max_level() >= 2, + "expected a multi-level graph (got max_level {})", + builder.max_level() + ); + + let batch = builder.to_batch().unwrap(); + let md = builder.metadata(); + let total_counted = *md.level_offsets.last().unwrap(); + + // The exact misalignment: more serialized rows than `level_count` sums + // to, because the entry-point node is written at every level yet + // counted only at level 0. 
+ assert!( + batch.num_rows() > total_counted, + "expected serialized rows ({}) to exceed sum(level_count) ({}) -- \ + entry point should be written at every level", + batch.num_rows(), + total_counted, + ); + + // Level-0 slice must still be exactly `[0, N)` with + // `__vector_id == row` -- the precondition for `LevelLookup::Dense`. + let n = md.level_offsets[1]; + assert_eq!(n, TOTAL); + let level0 = batch.slice(0, n); + let ids = level0.column(0).as_primitive::(); + assert!( + ids.values() + .iter() + .enumerate() + .all(|(row, id)| *id == row as u32), + "level-0 __vector_id must equal the row index", + ); + + // Despite the surplus rows and off-by-one upper slices, the loaded + // graph searches bit-identically to the in-memory build (old `load` + // semantics preserved via the `Sparse` last-write-wins map). + let loaded = HNSW::load(batch).unwrap(); + assert!(matches!(loaded.inner.graph, HnswGraph::Loaded(_))); + let params = HnswQueryParams { + ef: 50, + lower_bound: None, + upper_bound: None, + dist_q_c: 0.0, + }; + let query = fsl.value(0); + let builder_results = builder + .search_basic(query.clone(), 10, ¶ms, None, store.as_ref()) + .unwrap(); + let loaded_results = loaded + .search_basic(query, 10, ¶ms, None, store.as_ref()) + .unwrap(); + assert_eq!(builder_results, loaded_results); + } + + /// `load()` must reject a batch whose level-0 `__vector_id` no longer + /// matches the row index. The `LevelLookup::Dense` fast path relies on + /// `row == id`, and the old `debug_assert!` was compiled out of release + /// builds -- so a corrupt batch must fail at the `load()` boundary instead + /// of silently searching the wrong neighbor lists. 
+ #[tokio::test] + async fn test_load_rejects_misaligned_level0_id() { + use arrow::array::AsArray; + use arrow::datatypes::UInt32Type; + + const DIM: usize = 16; + const TOTAL: usize = 256; + let fsl = + FixedSizeListArray::try_new_from_values(generate_random_array(TOTAL * DIM), DIM as i32) + .unwrap(); + let store = Arc::new(FlatFloatStorage::new(fsl, DistanceType::L2)); + let builder = HNSW::index_vectors( + store.as_ref(), + HnswBuildParams::default().num_edges(20).ef_construction(50), + ) + .unwrap(); + + let batch = builder.to_batch().unwrap(); + // Row 0 is always a level-0 node; break its `__vector_id == row` + // invariant while preserving the (metadata-bearing) schema. + let mut ids = batch + .column(0) + .as_primitive::() + .values() + .to_vec(); + ids[0] = ids.len() as u32; + let mut columns = batch.columns().to_vec(); + columns[0] = Arc::new(UInt32Array::from(ids)); + let corrupted = RecordBatch::try_new(batch.schema(), columns).unwrap(); + + assert!( + HNSW::load(corrupted).is_err(), + "load() must reject a misaligned level-0 __vector_id" + ); + } + + /// `load()` must reject metadata whose `entry_point` is out of range for + /// the node count: it indexes the `Dense` level-0 lookup directly, so an + /// out-of-range value would read past the level-0 neighbor buffer at search + /// time. 
+ #[tokio::test] + async fn test_load_rejects_out_of_range_entry_point() { + use super::{HNSW_METADATA_KEY, HnswMetadata}; + + const DIM: usize = 16; + const TOTAL: usize = 256; + let fsl = + FixedSizeListArray::try_new_from_values(generate_random_array(TOTAL * DIM), DIM as i32) + .unwrap(); + let store = Arc::new(FlatFloatStorage::new(fsl, DistanceType::L2)); + let builder = HNSW::index_vectors( + store.as_ref(), + HnswBuildParams::default().num_edges(20).ef_construction(50), + ) + .unwrap(); + + let batch = builder.to_batch().unwrap(); + let mut metadata = batch.schema_ref().metadata().clone(); + let mut md: HnswMetadata = + serde_json::from_str(metadata.get(HNSW_METADATA_KEY).unwrap()).unwrap(); + // Valid entry points are `[0, N)`; `level_offsets[1]` == N is one past. + let n = md.level_offsets[1]; + md.entry_point = n as u32; + metadata.insert( + HNSW_METADATA_KEY.to_string(), + serde_json::to_string(&md).unwrap(), + ); + // Rebuild the batch under the rewritten metadata. `with_schema` would + // reject this: it requires the new metadata to be a superset, but we + // are changing an existing key's value, not adding one. + let schema = batch.schema().as_ref().clone().with_metadata(metadata); + let corrupted = RecordBatch::try_new(Arc::new(schema), batch.columns().to_vec()).unwrap(); + + assert!( + HNSW::load(corrupted).is_err(), + "load() must reject an out-of-range entry_point" + ); + } + + /// An empty index round-trips: 0-row `to_batch` -> `load` -> empty graph. 
+ #[tokio::test] + async fn test_loaded_empty_index() { + const DIM: usize = 16; + let fsl = + FixedSizeListArray::try_new_from_values(generate_random_array(0), DIM as i32).unwrap(); + let store = Arc::new(FlatFloatStorage::new(fsl, DistanceType::L2)); + let builder = HNSW::index_vectors(store.as_ref(), HnswBuildParams::default()).unwrap(); + assert!(builder.is_empty()); + + let batch = builder.to_batch().unwrap(); + assert_eq!(batch.num_rows(), 0); + + let loaded = HNSW::load(batch).unwrap(); + assert!(loaded.is_empty()); + assert_eq!(loaded.len(), 0); + // A 0-row load short-circuits to the empty (Built) graph. + assert!(!matches!(loaded.inner.graph, HnswGraph::Loaded(_))); + assert_eq!(loaded.to_batch().unwrap().num_rows(), 0); + } + + /// build -> `to_batch` (b1) -> `load` -> `to_batch` (b2) must satisfy + /// `b1 == b2`, and the round-tripped batch must reload and search + /// identically. This is exactly the IVF partition-cache path: + /// `ivf/partition_serde.rs` calls `to_batch()` on a *loaded* index. 
+ #[tokio::test] + async fn test_to_batch_roundtrip_loaded() { + const DIM: usize = 24; + const TOTAL: usize = 1500; + let fsl = + FixedSizeListArray::try_new_from_values(generate_random_array(TOTAL * DIM), DIM as i32) + .unwrap(); + let store = Arc::new(FlatFloatStorage::new(fsl.clone(), DistanceType::L2)); + let builder = HNSW::index_vectors( + store.as_ref(), + HnswBuildParams::default().num_edges(16).ef_construction(50), + ) + .unwrap(); + + let b1 = builder.to_batch().unwrap(); + let loaded = HNSW::load(b1.clone()).unwrap(); + assert!(matches!(loaded.inner.graph, HnswGraph::Loaded(_))); + let b2 = loaded.to_batch().unwrap(); + assert_eq!(b1, b2); + + let reloaded = HNSW::load(b2).unwrap(); + let params = HnswQueryParams { + ef: 50, + lower_bound: None, + upper_bound: None, + dist_q_c: 0.0, + }; + let query = fsl.value(7); + let a = builder + .search_basic(query.clone(), 10, ¶ms, None, store.as_ref()) + .unwrap(); + let b = reloaded + .search_basic(query, 10, ¶ms, None, store.as_ref()) + .unwrap(); + assert_eq!(a, b); + } + + /// The loaded graph shares the Arrow batch and reconstructs no per-node + /// `Vec` / `Vec`, so it is strictly + /// lighter than the in-memory build representation. 
+ #[tokio::test] + async fn test_loaded_graph_is_arrow_backed() { + const DIM: usize = 32; + const TOTAL: usize = 2048; + let fsl = + FixedSizeListArray::try_new_from_values(generate_random_array(TOTAL * DIM), DIM as i32) + .unwrap(); + let store = Arc::new(FlatFloatStorage::new(fsl, DistanceType::L2)); + let builder = HNSW::index_vectors( + store.as_ref(), + HnswBuildParams::default().num_edges(20).ef_construction(50), + ) + .unwrap(); + assert!(!matches!(builder.inner.graph, HnswGraph::Loaded(_))); + + let loaded = HNSW::load(builder.to_batch().unwrap()).unwrap(); + assert!(matches!(loaded.inner.graph, HnswGraph::Loaded(_))); + assert!( + loaded.deep_size_of() < builder.deep_size_of(), + "loaded graph ({}) should be lighter than built ({})", + loaded.deep_size_of(), + builder.deep_size_of(), + ); + } } From 4a7d21ef4c42c00ce0bf8d805b4706bb4d6632eb Mon Sep 17 00:00:00 2001 From: Vivek Date: Wed, 3 Jun 2026 10:07:09 -0700 Subject: [PATCH 005/177] fix(fts): handle empty query tokens in flat full-text search (#7046) --- rust/lance-index/src/scalar/inverted/index.rs | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index 5b67ba1da9b..520d46a5097 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -4797,6 +4797,17 @@ pub async fn flat_bm25_search_stream_with_metrics( let mut tokenizer = tokenizer; let query_tokens = Arc::new(collect_query_tokens(&query, &mut tokenizer)); + // A query that tokenizes to no terms (e.g. only stop words) has no + // searchable content and matches nothing. Return early rather than + // proceeding. This mirrors the indexed search path, which already + // short-circuits on empty query tokens. 
+ if query_tokens.is_empty() { + return Ok(Box::pin(RecordBatchStreamAdapter::new( + FTS_SCHEMA.clone(), + stream::empty::>(), + ))); + } + let input_schema = input.schema(); let doc_col_idx = input_schema.index_of(&doc_col)?; @@ -4882,6 +4893,9 @@ mod tests { use std::collections::HashMap; use std::sync::Arc; + use crate::scalar::inverted::tokenizer::document_tokenizer::TextTokenizer; + use lance_tokenizer::{Language, SimpleTokenizer, StopWordFilter, TextAnalyzer}; + use super::*; async fn write_single_partition_index( @@ -7281,4 +7295,56 @@ mod tests { "warm-cache scores must match cold" ); } + + #[tokio::test] + async fn flat_bm25_search_stop_word_query_over_unindexed_rows_returns_empty() { + let schema = Arc::new(Schema::new(vec![ + ROW_ID_FIELD.clone(), + Field::new("text", DataType::Utf8, false), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(UInt64Array::from(vec![0u64, 1, 2])), + Arc::new(StringArray::from(vec![ + "the quick brown fox", + "a lazy dog", + "for the win", + ])), + ], + ) + .unwrap(); + + let input: SendableRecordBatchStream = Box::pin(RecordBatchStreamAdapter::new( + schema.clone(), + stream::iter(vec![Ok(batch)]), + )); + + // Analyzer with an English stop-word filter, so the query "the" + // tokenizes to zero terms -- exactly the production trigger. 
+ let tokenizer: Box = Box::new(TextTokenizer::new( + TextAnalyzer::builder(SimpleTokenizer::default()) + .filter(StopWordFilter::new(Language::English).unwrap()) + .build(), + )); + + let result_stream = flat_bm25_search_stream_with_metrics( + input, + "text".to_string(), + "the".to_string(), + tokenizer, + None, + 100, + None, + ) + .await + .unwrap(); + + let batches: Vec<_> = result_stream.try_collect().await.unwrap(); + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!( + total_rows, 0, + "a stop-word-only query has no searchable terms and must match nothing" + ); + } } From 549ce37d73951e59da2d3118620cb474326e455e Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Wed, 3 Jun 2026 12:26:45 -0700 Subject: [PATCH 006/177] perf(index): fix O(N log N) warm-cache regression in BitmapIndex (#7079) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem Commit 4de5ce67d ("feat(index): serializable cache for Bitmap and LabelList scalar indices #6874") introduced a performance regression in `BitmapIndexPlugin::get_from_cache`. Every warm-cache hit against a bitmap scalar index now pays O(N log N) cost where N is the number of unique values in the column, instead of O(1). The regression: the new implementation stored only the serializable `BitmapIndexState` (an Arrow `RecordBatch`) in the cache and reconstructed the full `BTreeMap` on every cache hit by calling `parse_lookup_batch`. For a column with 10M unique values this rebuilds the map on every query — including `IS NULL`, whose actual bitmap lookup is `(*self.null_map).clone()` and is otherwise O(1). `parse_lookup_batch` is expensive because: 1. It calls `ScalarValue::try_from_array` for every row — one heap allocation per unique value. 2. It inserts into a `BTreeMap` — O(log N) comparisons per insert, O(N log N) total. ## Fix **`BitmapIndex.index_map`**: Changed from `BTreeMap` to `Arc>`. 
The map is immutable after construction, so sharing it behind an `Arc` is safe, and cloning is O(1). **`BitmapIndexState`**: Added an `index_map: Arc<BTreeMap<OrderableScalarValue, usize>>` field that is **not serialized** — the wire format is unchanged. It is populated eagerly: - `from_index` (called by `put_in_cache`): `Arc::clone`s the map from the live `BitmapIndex` — O(1). - `deserialize` (disk-backed cache backends): calls `parse_lookup_batch` once at deserialization time, which is already paying disk I/O cost. **`into_bitmap_index`** (renamed to `to_bitmap_index` in this change): Now takes `&self` and simply `Arc::clone`s `self.index_map` — always O(1), no reconstruction. **`get_from_cache`**: The intermediate `(*state).clone()` is removed since `to_bitmap_index` no longer consumes `self`. `LabelListIndex` had the same dual-entry patch applied in a prior iteration; that is also reverted to the original single-entry approach (its `BitmapIndexState` path is unchanged by this PR). ## Test Added `test_bitmap_cache_fast_path` to `bitmap.rs`: - Creates a high-cardinality bitmap index (1,000 unique integers + 5 null rows) - Calls `put_in_cache`, then `get_from_cache` - Asserts `get_from_cache` returns `Some` - Runs `IS NULL` and asserts the correct 5 null rows are returned To measure the end-to-end impact, run the `bitmap / is_null / warm` case in `python/python/ci_benchmarks/benchmarks/test_count_rows.py` — latency should be close to `btree / is_null / warm`. 
--------- Co-authored-by: Claude Sonnet 4.6 --- rust/lance-index/src/scalar/bitmap.rs | 121 ++++++++++++++++++---- rust/lance-index/src/scalar/label_list.rs | 2 +- 2 files changed, 103 insertions(+), 20 deletions(-) diff --git a/rust/lance-index/src/scalar/bitmap.rs b/rust/lance-index/src/scalar/bitmap.rs index 228a522c634..10254e699c5 100644 --- a/rust/lance-index/src/scalar/bitmap.rs +++ b/rust/lance-index/src/scalar/bitmap.rs @@ -116,7 +116,7 @@ pub struct BitmapIndex { /// Maps each unique value to its bitmap location in the index file /// The usize value is the row offset in the bitmap_page_lookup.lance file /// for quickly locating the row and reading it out - index_map: BTreeMap, + index_map: Arc>, null_map: Arc, @@ -173,11 +173,17 @@ pub struct BitmapIndexState { /// Cached separately from the schema for the empty-index case where the /// `lookup_batch` is empty but we still need to remember the column type. value_type: DataType, + /// Parsed form of `lookup_batch`. Not serialized — populated eagerly in + /// both [`BitmapIndexState::from_index`] and [`CacheCodecImpl::deserialize`]. + /// Stored as `Arc` so cloning into a new [`BitmapIndex`] is O(1). 
+ index_map: Arc>, } impl DeepSizeOf for BitmapIndexState { fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { - self.lookup_batch.get_array_memory_size() + self.null_map.deep_size_of_children(context) + self.lookup_batch.get_array_memory_size() + + self.null_map.deep_size_of_children(context) + + self.index_map.deep_size_of_children(context) } } @@ -187,20 +193,20 @@ impl BitmapIndexState { lookup_batch: build_lookup_batch(&index.index_map, &index.value_type)?, null_map: index.null_map.clone(), value_type: index.value_type.clone(), + index_map: index.index_map.clone(), }) } - pub(crate) fn into_bitmap_index( - self, + pub(crate) fn to_bitmap_index( + &self, store: Arc, index_cache: &LanceCache, frag_reuse_index: Option>, ) -> Result> { - let index_map = parse_lookup_batch(&self.lookup_batch)?; Ok(Arc::new(BitmapIndex::new( - index_map, - self.null_map, - self.value_type, + self.index_map.clone(), + self.null_map.clone(), + self.value_type.clone(), store, WeakLanceCache::from(index_cache), frag_reuse_index, @@ -265,10 +271,12 @@ impl CacheCodecImpl for BitmapIndexState { let null_map = Arc::new(RowAddrTreeMap::deserialize_from(null_bytes.as_ref())?); let lookup_batch = read_ipc_stream_single_at(data, &mut offset)?; let value_type = lookup_batch.schema().field(0).data_type().clone(); + let index_map = Arc::new(parse_lookup_batch(&lookup_batch)?); Ok(Self { lookup_batch, null_map, value_type, + index_map, }) } } @@ -295,7 +303,7 @@ impl CacheKey for BitmapIndexStateKey { impl BitmapIndex { fn new( - index_map: BTreeMap, + index_map: Arc>, null_map: Arc, value_type: DataType, store: Arc, @@ -326,7 +334,7 @@ impl BitmapIndex { let schema = page_lookup_file.schema(); let data_type = schema.fields[0].data_type(); return Ok(Arc::new(Self::new( - BTreeMap::new(), + Arc::new(BTreeMap::new()), Arc::new(RowAddrTreeMap::default()), data_type, store, @@ -381,7 +389,7 @@ impl BitmapIndex { } Ok(Arc::new(Self::new( - index_map, + Arc::new(index_map), 
null_map, value_type, store, @@ -466,12 +474,7 @@ impl BitmapIndex { impl DeepSizeOf for BitmapIndex { fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { - let mut total_size = 0; - - total_size += self.index_map.deep_size_of_children(context); - total_size += self.store.deep_size_of_children(context); - - total_size + self.index_map.deep_size_of_children(context) + self.store.deep_size_of_children(context) } } @@ -1754,8 +1757,7 @@ impl ScalarIndexPlugin for BitmapIndexPlugin { let Some(state) = cache.get_with_key(&BitmapIndexStateKey).await else { return Ok(None); }; - let state = (*state).clone(); - let index = state.into_bitmap_index(index_store, cache, frag_reuse_index)?; + let index = state.to_bitmap_index(index_store, cache, frag_reuse_index)?; Ok(Some(index as Arc)) } @@ -1843,6 +1845,7 @@ mod tests { lookup_batch: build_lookup_batch(&index_map, &DataType::Int32).unwrap(), null_map: Arc::new(null_map), value_type: DataType::Int32, + index_map: Arc::new(index_map), }; assert_state_roundtrips(&state); @@ -1851,6 +1854,7 @@ mod tests { lookup_batch: build_lookup_batch(&BTreeMap::new(), &DataType::Utf8).unwrap(), null_map: Arc::new(RowAddrTreeMap::new()), value_type: DataType::Utf8, + index_map: Arc::new(BTreeMap::new()), }; assert_state_roundtrips(&empty_state); } @@ -1990,6 +1994,85 @@ mod tests { } } + // Regression test for the O(N log N) warm-cache rebuild introduced in + // commit 4de5ce67d. BitmapIndexState now caches the parsed Arc + // so that get_from_cache skips parse_lookup_batch on warm hits. + // IS NULL is the worst case: the actual bitmap lookup is O(1) but + // reconstruction of the BTreeMap touched every row in the lookup batch. 
+ #[tokio::test] + async fn test_bitmap_cache_fast_path() { + use arrow_array::Int32Array; + + let tmpdir = TempObjDir::default(); + let store = Arc::new(LanceIndexStore::new( + Arc::new(ObjectStore::local()), + tmpdir.clone(), + Arc::new(LanceCache::no_cache()), + )); + + // High-cardinality: 1 000 unique integers + 5 null rows. + const N: u64 = 1_000; + const NULL_COUNT: u64 = 5; + // nulls first (sorted batch: nulls precede values) + let null_values: Vec> = + std::iter::repeat_n(None, NULL_COUNT as usize).collect(); + let non_null_values: Vec> = (0..N as i32).map(Some).collect(); + let all_values: Vec> = null_values.into_iter().chain(non_null_values).collect(); + let all_row_ids: Vec = (0..N + NULL_COUNT).collect(); + + let schema = Arc::new(Schema::new(vec![ + Field::new("value", DataType::Int32, true), + Field::new("_rowid", DataType::UInt64, false), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(all_values)), + Arc::new(UInt64Array::from(all_row_ids)), + ], + ) + .unwrap(); + let stream = stream::once(async move { Ok(batch) }); + let stream = Box::pin(RecordBatchStreamAdapter::new(schema, stream)); + BitmapIndexPlugin::train_bitmap_index(stream, store.as_ref()) + .await + .unwrap(); + + let cache = LanceCache::with_capacity(16 * 1024 * 1024); + let index = BitmapIndex::load(store.clone(), None, &cache) + .await + .unwrap(); + + let plugin = BitmapIndexPlugin; + let index_arc: Arc = index.clone() as Arc; + plugin.put_in_cache(&cache, index_arc).await.unwrap(); + + // get_from_cache must return Some, and the BitmapIndexState's index_map + // must have been populated by put_in_cache so no parse_lookup_batch occurs. + let cached = plugin + .get_from_cache(store.clone(), None, &cache) + .await + .unwrap() + .expect("get_from_cache must return Some after put_in_cache"); + + // IS NULL: trivial work once the index is in hand.
+ let query = SargableQuery::IsNull(); + match cached.search(&query, &NoOpMetricsCollector).await.unwrap() { + SearchResult::Exact(row_set) => { + let mut null_rows: Vec = row_set + .true_rows() + .row_addrs() + .unwrap() + .map(u64::from) + .collect(); + null_rows.sort(); + let expected: Vec = (0..NULL_COUNT).collect(); + assert_eq!(null_rows, expected); + } + _ => panic!("Expected Exact result for IS NULL"), + } + } + #[tokio::test] #[ignore] async fn test_big_bitmap_index() { diff --git a/rust/lance-index/src/scalar/label_list.rs b/rust/lance-index/src/scalar/label_list.rs index d0055e201ac..86b1bd6d3df 100644 --- a/rust/lance-index/src/scalar/label_list.rs +++ b/rust/lance-index/src/scalar/label_list.rs @@ -524,7 +524,7 @@ impl LabelListIndexState { ) -> Result> { let bitmap = self .bitmap_state - .into_bitmap_index(store, index_cache, frag_reuse_index)?; + .to_bitmap_index(store, index_cache, frag_reuse_index)?; Ok(Arc::new(LabelListIndex::new(bitmap, self.list_nulls))) } } From 24469e1f064d0449d498b31dee0b6a35712af155 Mon Sep 17 00:00:00 2001 From: Prashanth Rao <35005448+prrao87@users.noreply.github.com> Date: Wed, 3 Jun 2026 16:26:39 -0400 Subject: [PATCH 007/177] docs: fix broken pre-release install commands (#7088) Remove redundant trailing slashes and update pylance pre-release pip install command in README.md --- README.md | 2 +- docs/src/quickstart/index.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 2f0b2bca18f..886fd70425e 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ pip install pylance To install a preview release: ```shell -pip install --pre --extra-index-url https://pypi.fury.io/lance-format/pylance +pip install --pre --extra-index-url https://pypi.fury.io/lance-format pylance ``` > [!TIP] diff --git a/docs/src/quickstart/index.md b/docs/src/quickstart/index.md index 606948263c4..34367c7177f 100644 --- a/docs/src/quickstart/index.md +++ b/docs/src/quickstart/index.md @@ -22,17 
+22,17 @@ For the latest features and bug fixes, you can install the preview version: === "pip" ```bash - pip install --pre --extra-index-url https://pypi.fury.io/lance-format/ pylance + pip install --pre --extra-index-url https://pypi.fury.io/lance-format pylance ``` === "uv" ```bash uv venv - uv pip install --prerelease allow --index https://pypi.fury.io/lance-format/ pylance + uv pip install --prerelease allow --index https://pypi.fury.io/lance-format pylance # To add to pyproject.toml, just do: - uv add --prerelease allow --index https://pypi.fury.io/lance-format/ pylance + uv add --prerelease allow --index https://pypi.fury.io/lance-format pylance ``` !!! note From 73025ce77cc888b04e6a52f537602326fee12f4f Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Wed, 3 Jun 2026 13:42:26 -0700 Subject: [PATCH 008/177] feat: use indexes to accelerate filtered count_rows (#6916) Co-authored-by: Claude Opus 4.7 (1M context) --- .../benchmarks/test_count_rows.py | 133 +++ .../ci_benchmarks/datagen/count_rows.py | 98 +++ .../python/ci_benchmarks/datagen/gen_all.py | 4 + python/python/lance/dataset.py | 10 +- python/python/lance/lance/__init__.pyi | 2 +- python/python/tests/test_count_pushdown.py | 178 ++++ python/src/scanner.rs | 15 +- rust/lance-index/src/scalar/expression.rs | 55 +- rust/lance-index/src/scalar/json.rs | 2 + rust/lance/src/dataset/scanner.rs | 20 + .../src/dataset/tests/dataset_aggregate.rs | 132 ++- rust/lance/src/index.rs | 63 ++ rust/lance/src/io/exec.rs | 2 + rust/lance/src/io/exec/count_from_mask.rs | 624 ++++++++++++++ rust/lance/src/io/exec/count_pushdown.rs | 798 ++++++++++++++++++ rust/lance/src/io/exec/optimizer.rs | 4 + rust/lance/src/io/exec/scalar_index.rs | 5 + rust/lance/tests/count_pushdown/mod.rs | 183 ++++ rust/lance/tests/integration_tests.rs | 1 + 19 files changed, 2310 insertions(+), 19 deletions(-) create mode 100644 python/python/ci_benchmarks/benchmarks/test_count_rows.py create mode 100644 
python/python/ci_benchmarks/datagen/count_rows.py create mode 100644 python/python/tests/test_count_pushdown.py create mode 100644 rust/lance/src/io/exec/count_from_mask.rs create mode 100644 rust/lance/src/io/exec/count_pushdown.rs create mode 100644 rust/lance/tests/count_pushdown/mod.rs diff --git a/python/python/ci_benchmarks/benchmarks/test_count_rows.py b/python/python/ci_benchmarks/benchmarks/test_count_rows.py new file mode 100644 index 00000000000..f6228b04f13 --- /dev/null +++ b/python/python/ci_benchmarks/benchmarks/test_count_rows.py @@ -0,0 +1,133 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +"""Benchmark count_rows acceleration for IS NULL / IS NOT NULL filters. + +Tests five index configurations against an int32 dataset with ~1% NULL +values. Each configuration stores the same data in a separate column so +that only one index type is active per measurement: + + none — no index, full column scan (baseline) + BITMAP — bitmap index + BTREE — btree index + ZONEMAP — zone-map index + BLOOMFILTER — bloom-filter index + +Two filters are exercised for each configuration: + IS NULL — count the ~1% null rows + IS NOT NULL — count the ~99% non-null rows + +Indexed configurations are tested in two cache states to separate first-load +latency from steady-state throughput: + + warm — one prewarm call is made before measuring; the same dataset instance + is reused so its in-memory index cache is already populated. + cold — a fresh ``lance.dataset()`` instance is created inside each measured + round so the in-memory index cache starts empty every time. No + prewarm pass is performed. 
+""" + +from __future__ import annotations + +import lance +import pytest +from ci_benchmarks.datasets import get_dataset_uri + +# --------------------------------------------------------------------------- +# Parameters +# --------------------------------------------------------------------------- + +# Indexed configs only (warm/cold dimension applies to these) +_INDEXED_CONFIGS: list[tuple[str, str]] = [ + ("bitmap", "value_bitmap"), + ("btree", "value_btree"), + ("zonemap", "value_zonemap"), + ("bloomfilter", "value_bloomfilter"), +] +_INDEXED_IDS = [cfg[0] for cfg in _INDEXED_CONFIGS] + +_FILTERS = ["is_null", "is_not_null"] + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="module") +def count_rows_ds() -> lance.LanceDataset: + """Shared dataset instance (index cache persists across rounds — use for warm).""" + return lance.dataset(get_dataset_uri("count_rows")) + + +@pytest.fixture(scope="module") +def count_rows_uri() -> str: + return get_dataset_uri("count_rows") + + +# --------------------------------------------------------------------------- +# No-index baseline (no warm/cold — there is no index cache to speak of) +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("filter_type", _FILTERS) +def test_count_rows_no_index( + benchmark, + count_rows_ds: lance.LanceDataset, + filter_type: str, +) -> None: + """Full-scan baseline with no scalar index.""" + filt = ( + "value_none IS NULL" if filter_type == "is_null" else "value_none IS NOT NULL" + ) + + def bench() -> int: + return count_rows_ds.count_rows(filter=filt) + + benchmark.pedantic(bench, warmup_rounds=1, rounds=5) + + +# --------------------------------------------------------------------------- +# Indexed benchmarks — warm vs cold +# 
--------------------------------------------------------------------------- + + +@pytest.mark.parametrize("warm", [True, False], ids=["warm", "cold"]) +@pytest.mark.parametrize("filter_type", _FILTERS) +@pytest.mark.parametrize("index_id,column", _INDEXED_CONFIGS, ids=_INDEXED_IDS) +def test_count_rows_indexed( + benchmark, + count_rows_ds: lance.LanceDataset, + count_rows_uri: str, + index_id: str, + column: str, + filter_type: str, + warm: bool, +) -> None: + """Benchmark count_rows with a scalar index, in warm and cold cache states. + + Args: + index_id: Human-readable index name (parametrize label only). + column: Dataset column that carries this index type. + filter_type: ``"is_null"`` or ``"is_not_null"``. + warm: If True, prewarm the index cache before measuring and reuse the + shared dataset instance. If False, create a fresh dataset + instance on every round so the index cache starts empty. + """ + filt = f"{column} IS NULL" if filter_type == "is_null" else f"{column} IS NOT NULL" + + if warm: + + def bench() -> int: + return count_rows_ds.count_rows(filter=filt) + + # warmup_rounds=1 makes one unmeasured call that populates the cache. + benchmark.pedantic(bench, warmup_rounds=1, rounds=5) + else: + + def bench() -> int: + # Fresh instance → empty in-memory index cache every round. + ds = lance.dataset(count_rows_uri) + return ds.count_rows(filter=filt) + + benchmark.pedantic(bench, warmup_rounds=0, rounds=5) diff --git a/python/python/ci_benchmarks/datagen/count_rows.py b/python/python/ci_benchmarks/datagen/count_rows.py new file mode 100644 index 00000000000..ff5fa02cf8a --- /dev/null +++ b/python/python/ci_benchmarks/datagen/count_rows.py @@ -0,0 +1,98 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +"""Generate the count_rows benchmark dataset. + +Creates a 10-million-row Lance dataset with five int32 columns that all hold the +same values and the same ~1% null mask. 
Each column carries a different scalar +index so the benchmark can compare no-index, BITMAP, BTREE, ZONEMAP, and +BLOOMFILTER side-by-side on the same underlying data. + +Column layout +------------- +value_none — no index (full-scan baseline) +value_bitmap — BITMAP index +value_btree — BTREE index +value_zonemap — ZONEMAP index +value_bloomfilter — BLOOMFILTER index + +Null pattern: row i is null when i % 100 == 0 (~1% of rows exactly). +""" + +import lance +import numpy as np +import pyarrow as pa +from lance.log import LOGGER + +from ci_benchmarks.datasets import get_dataset_uri + +NUM_ROWS = 10_000_000 +BATCH_SIZE = 1_000_000 # 1 M rows per batch → 10 batches total + +COLUMNS = [ + "value_none", + "value_bitmap", + "value_btree", + "value_zonemap", + "value_bloomfilter", +] + +SCHEMA = pa.schema([(col, pa.int32()) for col in COLUMNS]) + + +def _gen_data(): + num_batches = NUM_ROWS // BATCH_SIZE + for batch_idx in range(num_batches): + offset = batch_idx * BATCH_SIZE + values = np.arange(offset, offset + BATCH_SIZE, dtype=np.int32) + # Null mask: True where the value should be null (~1% of rows) + null_mask = (np.arange(BATCH_SIZE) + offset) % 100 == 0 + col = pa.array(values, type=pa.int32(), mask=null_mask) + yield pa.record_batch([col] * len(COLUMNS), schema=SCHEMA) + + +def gen_count_rows() -> lance.LanceDataset: + dataset_uri = get_dataset_uri("count_rows") + + try: + ds = lance.dataset(dataset_uri) + if ds.count_rows() == NUM_ROWS: + LOGGER.info( + "count_rows dataset already exists at %s (%d rows)", + dataset_uri, + NUM_ROWS, + ) + return ds + LOGGER.warning( + "count_rows dataset at %s has unexpected row count %d; regenerating", + dataset_uri, + ds.count_rows(), + ) + except Exception: + pass + + LOGGER.info( + "Writing count_rows dataset (%d rows, %d columns) to %s", + NUM_ROWS, + len(COLUMNS), + dataset_uri, + ) + ds = lance.write_dataset( + _gen_data(), + dataset_uri, + schema=SCHEMA, + mode="overwrite", + ) + LOGGER.info("Dataset written; building 
scalar indexes …") + + for index_type, column in [ + ("BITMAP", "value_bitmap"), + ("BTREE", "value_btree"), + ("ZONEMAP", "value_zonemap"), + ("BLOOMFILTER", "value_bloomfilter"), + ]: + LOGGER.info(" Creating %s index on %s …", index_type, column) + ds.create_scalar_index(column, index_type) + + LOGGER.info("count_rows dataset ready.") + return ds diff --git a/python/python/ci_benchmarks/datagen/gen_all.py b/python/python/ci_benchmarks/datagen/gen_all.py index 1da7c05fd9b..d5120d20ff7 100644 --- a/python/python/ci_benchmarks/datagen/gen_all.py +++ b/python/python/ci_benchmarks/datagen/gen_all.py @@ -6,6 +6,7 @@ from lance.log import LOGGER from ci_benchmarks.datagen.basic import gen_basic +from ci_benchmarks.datagen.count_rows import gen_count_rows from ci_benchmarks.datagen.lineitems import gen_tcph from ci_benchmarks.datagen.wikipedia import gen_wikipedia @@ -40,6 +41,9 @@ def setup_logging(): LOGGER.info("Generating Wikipedia dataset...") gen_wikipedia() + LOGGER.info("Generating count_rows benchmark dataset...") + gen_count_rows() + LOGGER.info("=" * 80) LOGGER.info("All datasets generated successfully!") LOGGER.info("=" * 80) diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 743f5f580b6..59c995de87b 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -6498,20 +6498,22 @@ def explain_plan(self, verbose=False) -> str: return self._scanner.explain_plan(verbose=verbose) - def analyze_plan(self) -> str: + def analyze_plan(self, count_rows: bool = False) -> str: """Execute the plan for this scanner and display with runtime metrics. Parameters ---------- - verbose : bool, default False - Use a verbose output format. + count_rows : bool, default False + If True, auto-apply a ``COUNT(*)`` aggregate before analyzing so + the returned plan reflects what :py:meth:`count_rows` would + execute (including the optimizer's count-pushdown decisions). 
Returns ------- plan : str """ - return self._scanner.analyze_plan() + return self._scanner.analyze_plan(count_rows=count_rows) class DatasetOptimizer: diff --git a/python/python/lance/lance/__init__.pyi b/python/python/lance/lance/__init__.pyi index f3bc9a681b2..748698d169b 100644 --- a/python/python/lance/lance/__init__.pyi +++ b/python/python/lance/lance/__init__.pyi @@ -475,7 +475,7 @@ class _Scanner: @property def schema(self) -> pa.Schema: ... def explain_plan(self, verbose: bool) -> str: ... - def analyze_plan(self) -> str: ... + def analyze_plan(self, count_rows: bool = False) -> str: ... def count_rows(self) -> int: ... def to_pyarrow(self) -> pa.RecordBatchReader: ... diff --git a/python/python/tests/test_count_pushdown.py b/python/python/tests/test_count_pushdown.py new file mode 100644 index 00000000000..896bd629fc9 --- /dev/null +++ b/python/python/tests/test_count_pushdown.py @@ -0,0 +1,178 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +"""End-to-end tests for count-from-mask pushdown. + +The optimizer rule under test (`CountPushdown`) rewrites +``SELECT COUNT(*) ... WHERE indexed_col <op> v`` into +``AggregateExec(Final) → CountFromMaskExec → ScalarIndexExec`` when the +index covers every dataset fragment, or splits into a Union of a pushdown +branch over the indexed fragments and a scan branch over the rest when +coverage is partial. This is category 1 (count-from-mask) of the four +aggregate-acceleration categories; the other three (mask-to-answer, +zone-aware, dimension-keyed) are not implemented yet. + +Each test exercises a different state of the dataset (clean, with deletions, +with updates that introduce unindexed fragments, with a fully-deleted indexed +fragment) and asserts: + + 1. The returned count matches the ground truth (correctness), and + 2. The plan routes through ``CountFromMaskExec`` (the rule fired). 
+ +For the cases where the index covers the whole dataset, the tests also assert +no ``LanceRead`` is present in the plan — proof that the count is being +answered from index metadata, not by scanning column data. The happy-path +test additionally re-runs the query and asserts the second call performs no +I/O. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import lance +import pyarrow as pa + +if TYPE_CHECKING: + from pathlib import Path + + +# -------------------------------------------------------------------------- +# Helpers +# -------------------------------------------------------------------------- + +# 4 fragments × 25 rows = 100 rows; values 0..99 in `x`. +NUM_FRAGMENTS = 4 +ROWS_PER_FRAGMENT = 25 +NUM_ROWS = NUM_FRAGMENTS * ROWS_PER_FRAGMENT # 100 + + +def _make_dataset(tmp_path: Path) -> lance.LanceDataset: + """Build a 4-fragment dataset with a BTREE index on `x`.""" + table = pa.table({"x": pa.array(range(NUM_ROWS), pa.int64())}) + dataset = lance.write_dataset( + table, + tmp_path / "ds", + max_rows_per_file=ROWS_PER_FRAGMENT, + ) + assert len(dataset.get_fragments()) == NUM_FRAGMENTS + dataset.create_scalar_index("x", "BTREE") + return dataset + + +def _filtered_count_plan(dataset: lance.LanceDataset, filter: str) -> str: + """Return the ``analyze_plan(count_rows=True)`` output for a filtered + ``COUNT(*)`` — the same plan ``count_rows(filter=…)`` actually executes.""" + return dataset.scanner(columns=[], with_row_id=True, filter=filter).analyze_plan( + count_rows=True + ) + + +def _assert_pushdown_fired(plan: str) -> None: + assert "CountFromMask" in plan, f"expected CountFromMaskExec in plan, got:\n{plan}" + + +def _assert_no_column_scan(plan: str) -> None: + """Stricter: no LanceRead anywhere. 
Only applies when the index covers + every dataset fragment (no partial-coverage split branch).""" + assert "LanceRead" not in plan, ( + f"unexpected LanceRead in plan — column data was scanned:\n{plan}" + ) + + +# -------------------------------------------------------------------------- +# Tests +# -------------------------------------------------------------------------- + + +def test_filtered_count_with_scalar_index(tmp_path: Path): + """Happy path: filtered count on an indexed column, run twice. + + The second call must perform zero I/O — proof the rule routed the count + through the index/deletion-mask metadata both times and the second call + re-used the cache. The check uses ``dataset.io_stats_incremental()`` + rather than parsing the plan's ``bytes_read=…`` so we get a direct + accounting of every object-store read the dataset performed during the + second call, not just what the plan happens to surface. + """ + dataset = _make_dataset(tmp_path) + filter = "x < 50" + expected = 50 + + # Verify the rule fires for this shape. + _assert_pushdown_fired(_filtered_count_plan(dataset, filter)) + _assert_no_column_scan(_filtered_count_plan(dataset, filter)) + + # First call warms the index + deletion-mask caches. + assert dataset.count_rows(filter=filter) == expected + # Reset counters so the next snapshot only reflects the second call. + dataset.io_stats_incremental() + + # Second call: must do zero I/O. + assert dataset.count_rows(filter=filter) == expected + stats = dataset.io_stats_incremental() + assert stats.read_iops == 0, f"expected 0 read_iops, got {stats.read_iops}" + assert stats.read_bytes == 0, f"expected 0 read_bytes, got {stats.read_bytes}" + + +def test_filtered_count_with_deleted_rows(tmp_path: Path): + """Some matching rows are deleted — the count must reflect the deletions. + + Deletions don't change fragment coverage, so the index still covers every + dataset fragment and the rule emits a single pushdown branch (no scan). 
+ """ + dataset = _make_dataset(tmp_path) + # Delete three rows that match the filter (x < 50). + dataset.delete("x = 10 OR x = 20 OR x = 30") + plan = _filtered_count_plan(dataset, "x < 50") + _assert_pushdown_fired(plan) + _assert_no_column_scan(plan) + assert dataset.count_rows(filter="x < 50") == 50 - 3 + + +def test_filtered_count_with_updated_rows(tmp_path: Path): + """Updates move rows in/out of the filter set. + + Before: x < 50 ⇒ 50 rows match (values 0..49). + After: + - x = 5 → x = 100 (one row leaves the matched set) + - x = 7 → x = 101 (another row leaves) + - x = 60 → x = 8 (a row joins the matched set) + - x = 70 → x = 9 (another joins) + + Net change: −2 + 2 = 0, so the final count is still 50, but the + underlying row identities have shifted. Each update is materialized as + a delete + insert into a new fragment in Lance — the new fragments are + not in the index's coverage, so the optimizer rule emits a split plan: + pushdown for the originally-indexed fragments, plus a scan branch for + the rewritten fragments. The final count must still be correct. + """ + dataset = _make_dataset(tmp_path) + dataset.update({"x": "100"}, where="x = 5") + dataset.update({"x": "101"}, where="x = 7") + dataset.update({"x": "8"}, where="x = 60") + dataset.update({"x": "9"}, where="x = 70") + + plan = _filtered_count_plan(dataset, "x < 50") + _assert_pushdown_fired(plan) + # 50 originally matching − 2 that left + 2 that joined = 50. + assert dataset.count_rows(filter="x < 50") == 50 + + +def test_filtered_count_with_whole_fragment_deleted(tmp_path: Path): + """Delete every row in one indexed fragment. + + Fragment 0 covers x ∈ [0, 25). Deleting all of those rows removes 25 + matches of `x < 50`, dropping the count from 50 to 25. + + Lance retires the now-empty fragment, so the dataset has 3 fragments + while the index still claims 4 — the index is a strict *superset* of + the dataset, which is safe (the extra index entries simply don't + apply). 
The rule emits a single pushdown branch (no scan needed). + """ + dataset = _make_dataset(tmp_path) + dataset.delete("x < 25") + plan = _filtered_count_plan(dataset, "x < 50") + _assert_pushdown_fired(plan) + _assert_no_column_scan(plan) + assert dataset.count_rows(filter="x < 50") == 25 diff --git a/python/src/scanner.rs b/python/src/scanner.rs index 691f7f53294..bbf1b3f35a3 100644 --- a/python/src/scanner.rs +++ b/python/src/scanner.rs @@ -125,14 +125,17 @@ impl Scanner { Ok(res) } - #[pyo3(signature = (*))] - fn analyze_plan(self_: PyRef<'_, Self>) -> PyResult { + #[pyo3(signature = (*, count_rows = false))] + fn analyze_plan(self_: PyRef<'_, Self>, count_rows: bool) -> PyResult { let scanner = self_.scanner.clone(); let res = rt() - .spawn( - Some(self_.py()), - async move { scanner.analyze_plan().await }, - )? + .spawn(Some(self_.py()), async move { + if count_rows { + scanner.analyze_count_plan().await + } else { + scanner.analyze_plan().await + } + })? .map_err(|err| PyValueError::new_err(err.to_string()))?; Ok(res) diff --git a/rust/lance-index/src/scalar/expression.rs b/rust/lance-index/src/scalar/expression.rs index 187d5be999f..38a29e9c43c 100644 --- a/rust/lance-index/src/scalar/expression.rs +++ b/rust/lance-index/src/scalar/expression.rs @@ -22,6 +22,7 @@ use super::{GeoQuery, RelationQuery}; use lance_core::{Error, Result}; use lance_datafusion::{expr::safe_coerce_scalar, planner::Planner}; use lance_select::{IndexExprResult, NullableIndexExprResult, NullableRowAddrMask}; +use roaring::RoaringBitmap; use tracing::instrument; const MAX_DEPTH: usize = 500; @@ -436,6 +437,7 @@ impl ScalarQueryParser for SargableQueryParser { index_type: self.index_type.clone(), query: Arc::new(query), needs_recheck: self.needs_recheck, + fragment_bitmap: None, })); // If the pattern has wildcards beyond simple prefix, add refine expression @@ -1074,7 +1076,8 @@ impl IndexedExpression { index_name, index_type, query, - needs_recheck: false, // Default to false, will be 
set by parser + needs_recheck: false, // Default to false, will be set by parser + fragment_bitmap: None, // Filled in by `apply_scalar_indices` })), refine_expr: None, } @@ -1095,6 +1098,7 @@ impl IndexedExpression { index_type, query, needs_recheck, + fragment_bitmap: None, // Filled in by `apply_scalar_indices` })), refine_expr: None, } @@ -1236,10 +1240,21 @@ pub struct ScalarIndexSearch { pub query: Arc, /// If true, the query results are inexact and will need a recheck pub needs_recheck: bool, + /// The fragments the underlying index has entries for. + /// + /// `None` means coverage is unknown (e.g. constructed outside of scanner + /// planning, or from a legacy code path). Optimizer rules that need to + /// decide whether the index covers the dataset must treat `None` as + /// "refuse to use" — the bitmap is the only way to safely answer that + /// question synchronously without an async metadata load. + pub fragment_bitmap: Option, } impl PartialEq for ScalarIndexSearch { fn eq(&self, other: &Self) -> bool { + // `fragment_bitmap` is metadata derived from the dataset state, not + // part of the query identity, so it intentionally does not participate + // in equality. self.column == other.column && self.index_name == other.index_name && self.query.as_ref().eq(other.query.as_ref()) @@ -1819,6 +1834,16 @@ pub trait IndexInformationProvider { /// Check if an index exists for `col` and, if so, return the data type of col /// as well as a query parser that can parse queries for that column fn get_index(&self, col: &str) -> Option<(&DataType, &dyn ScalarQueryParser)>; + + /// The set of fragments covered by `(column, index_name)`. + /// + /// Returns `None` when the provider doesn't know — callers must treat + /// that as "coverage unknown" rather than "covers everything". The + /// default implementation always returns `None`, so providers that + /// haven't been updated cannot accidentally claim full coverage. 
+ fn fragment_bitmap(&self, _column: &str, _index_name: &str) -> Option { + None + } } /// Attempt to split a filter expression into a search of scalar indexes and an @@ -1827,7 +1852,31 @@ pub fn apply_scalar_indices( expr: Expr, index_info: &dyn IndexInformationProvider, ) -> Result { - Ok(visit_node(&expr, index_info, 0)?.unwrap_or(IndexedExpression::refine_only(expr))) + let mut result = + visit_node(&expr, index_info, 0)?.unwrap_or(IndexedExpression::refine_only(expr)); + if let Some(query) = result.scalar_query.as_mut() { + populate_fragment_bitmaps(query, index_info); + } + Ok(result) +} + +/// Walk a [`ScalarIndexExpr`] and fill in `fragment_bitmap` on each leaf from +/// the `index_info` provider. Leaves the bitmap as `None` if the provider +/// can't answer. +fn populate_fragment_bitmaps( + expr: &mut ScalarIndexExpr, + index_info: &dyn IndexInformationProvider, +) { + match expr { + ScalarIndexExpr::Not(inner) => populate_fragment_bitmaps(inner, index_info), + ScalarIndexExpr::And(lhs, rhs) | ScalarIndexExpr::Or(lhs, rhs) => { + populate_fragment_bitmaps(lhs, index_info); + populate_fragment_bitmaps(rhs, index_info); + } + ScalarIndexExpr::Query(search) => { + search.fragment_bitmap = index_info.fragment_bitmap(&search.column, &search.index_name); + } + } } #[derive(Clone, Default, Debug)] @@ -2422,6 +2471,7 @@ mod tests { index_type: "BTree".to_string(), query: Arc::new(SargableQuery::Equals(ScalarValue::UInt32(Some(10)))), needs_recheck: false, + fragment_bitmap: None, })); let right = Box::new(ScalarIndexExpr::Query(ScalarIndexSearch { column: "color".to_string(), @@ -2431,6 +2481,7 @@ mod tests { "blue".to_string(), )))), needs_recheck: false, + fragment_bitmap: None, })); check( &index_info, diff --git a/rust/lance-index/src/scalar/json.rs b/rust/lance-index/src/scalar/json.rs index 86c1204e174..6431909c9ce 100644 --- a/rust/lance-index/src/scalar/json.rs +++ b/rust/lance-index/src/scalar/json.rs @@ -239,12 +239,14 @@ impl JsonQueryParser { 
index_type, query, needs_recheck, + fragment_bitmap, }) => ScalarIndexExpr::Query(ScalarIndexSearch { column, index_name, index_type, query: Arc::new(JsonQuery::new(query, self.path.clone())), needs_recheck, + fragment_bitmap, }), // This code path should only be hit on leaf expr _ => unreachable!(), diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index 9a5cd94dd09..3bdeee28f05 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -4828,6 +4828,26 @@ impl Scanner { Ok(format!("{}", display.indent(verbose))) } + + /// Run [`Self::count_rows`]'s underlying plan and return it formatted with + /// runtime metrics. Equivalent to [`Self::analyze_plan`] but with a + /// `COUNT(*)` aggregate auto-applied first — the only way for callers + /// without a hand-built `AggregateExpr` (e.g. the Python bindings) to + /// inspect the plan that `count_rows` actually executed. + #[instrument(level = "info", skip(self))] + pub async fn analyze_count_plan(&self) -> Result { + let mut scanner = self.clone(); + scanner.aggregate(AggregateExpr::builder().count_star().build())?; + let plan = scanner.create_plan().await?; + analyze_plan( + plan, + LanceExecutionOptions { + batch_size: self.batch_size, + ..Default::default() + }, + ) + .await + } } // Search over all indexed fields including nested ones, collecting columns that have an diff --git a/rust/lance/src/dataset/tests/dataset_aggregate.rs b/rust/lance/src/dataset/tests/dataset_aggregate.rs index ef2a90e6315..35bf0ac1f29 100644 --- a/rust/lance/src/dataset/tests/dataset_aggregate.rs +++ b/rust/lance/src/dataset/tests/dataset_aggregate.rs @@ -44,6 +44,7 @@ use crate::utils::test::{DatagenExt, FragmentCount, FragmentRowCount, assert_pla use lance_arrow::FixedSizeListArrayExt; use lance_index::IndexType; use lance_index::scalar::FullTextSearchQuery; +use lance_index::scalar::ScalarIndexParams; use lance_index::scalar::inverted::InvertedIndexParams; use 
lance_linalg::distance::MetricType; @@ -268,7 +269,9 @@ async fn test_count_star_single_fragment() { vec![], ); - // Verify COUNT(*) has empty projection optimization + // COUNT(*) is rewritten by CountPushdown into a Final aggregate + // over CountFromMaskExec, which answers from manifest metadata + the + // deletion mask instead of scanning column data. let mut scanner = ds.scan(); scanner .aggregate(AggregateExpr::substrait(agg_bytes.clone())) @@ -276,8 +279,8 @@ async fn test_count_star_single_fragment() { let plan = scanner.create_plan().await.unwrap(); assert_plan_node_equals( plan, - "AggregateExec: mode=Single, gby=[], aggr=[count(...)] - LanceRead: uri=..., projection=[], num_fragments=1, range_before=None, range_after=None, row_id=false, row_addr=true, full_filter=--, refine_filter=--", + "AggregateExec: mode=Final, gby=[], aggr=[count(...)] + CountFromMask", ) .await .unwrap(); @@ -1204,11 +1207,12 @@ async fn test_scanner_count_rows() { .unwrap(); let plan = scanner.create_plan().await.unwrap(); - // COUNT(*) should have empty projection (optimized to not read any columns) + // COUNT(*) is rewritten by CountPushdown into a Final aggregate + // over CountFromMaskExec. assert_plan_node_equals( plan.clone(), - "AggregateExec: mode=Single, gby=[], aggr=[count(Int32(1))] - LanceRead: uri=..., projection=[], num_fragments=2, range_before=None, range_after=None, row_id=false, row_addr=true, full_filter=--, refine_filter=--", + "AggregateExec: mode=Final, gby=[], aggr=[count(Int32(1))] + CountFromMask", ) .await .unwrap(); @@ -1255,6 +1259,122 @@ async fn test_scanner_count_rows_with_filter() { ); } +#[tokio::test] +async fn test_scanner_count_rows_with_indexed_filter() { + // When the filter is fully evaluable by a scalar index that covers + // every dataset fragment, the rule rewrites COUNT(*) into a Final + // aggregate over CountFromMaskExec, with the ScalarIndexExec + // wired in as the prefilter — no LanceRead, no column scan. 
+ let mut ds = create_numeric_dataset("memory://test_count_indexed", 2, 50).await; + ds.create_index( + &["x"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + + let mut scanner = ds.scan(); + scanner.filter("x < 50").unwrap(); + scanner + .aggregate(AggregateExpr::builder().count_star().build()) + .unwrap(); + let plan = scanner.create_plan().await.unwrap(); + + assert_plan_node_equals( + plan.clone(), + "AggregateExec: mode=Final, gby=[], aggr=[count(Int32(1))] + CountFromMask + ScalarIndexQuery: query=[x < 50]@x_idx(BTree)", + ) + .await + .unwrap(); + + let stream = execute_plan(plan, LanceExecutionOptions::default()).unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + assert_eq!(batches.len(), 1); + assert_eq!( + batches[0].column(0).as_primitive::().value(0), + 50, + ); +} + +#[tokio::test] +async fn test_scanner_count_rows_with_partial_index_coverage() { + // Index covers the first two fragments, then a third fragment is + // appended. The rule cannot answer the count from the index alone for + // the appended fragment, so it emits a split plan: CountFromMaskExec + // over the indexed fragments + AggregateExec(Partial)/FilteredReadExec + // over the rest, both unioned and summed by AggregateExec(Final). + let tmp = tempdir().unwrap(); + let uri = tmp.path().to_str().unwrap(); + let mut ds = create_numeric_dataset(uri, 2, 50).await; + ds.create_index( + &["x"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + + // Append a third fragment that the index does not cover. 
+ let reader = gen_batch() + .col("x", array::step_custom::(100, 1)) + .col("y", array::step_custom::(0, 2)) + .col("category", array::cycle::(vec![1, 2, 3])) + .into_reader_rows( + lance_datagen::RowCount::from(50), + lance_datagen::BatchCount::from(1), + ); + let ds = Dataset::write( + reader, + uri, + Some(crate::dataset::WriteParams { + mode: crate::dataset::WriteMode::Append, + max_rows_per_file: 50, + ..Default::default() + }), + ) + .await + .unwrap(); + assert_eq!(ds.get_fragments().len(), 3); + + let mut scanner = ds.scan(); + // `x < 1000` matches every row (values are 0..100 + 100..150). The + // pushdown branch contributes the first 100 from the indexed fragments; + // the scan branch contributes the 50 rows from the appended fragment. + scanner.filter("x < 1000").unwrap(); + scanner + .aggregate(AggregateExpr::builder().count_star().build()) + .unwrap(); + let plan = scanner.create_plan().await.unwrap(); + + assert_plan_node_equals( + plan.clone(), + "AggregateExec: mode=Final, gby=[], aggr=[count(Int32(1))] + CoalescePartitionsExec + UnionExec + CountFromMask + ScalarIndexQuery: query=[x < 1000]@x_idx(BTree) + AggregateExec: mode=Partial, gby=[], aggr=[count(Int32(1))] + LanceRead: uri=..., projection=[], num_fragments=1, range_before=None, range_after=None, row_id=false, row_addr=true, full_filter=x < Int64(1000), refine_filter=--", + ) + .await + .unwrap(); + + let stream = execute_plan(plan, LanceExecutionOptions::default()).unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + assert_eq!(batches.len(), 1); + assert_eq!( + batches[0].column(0).as_primitive::().value(0), + 150, + ); +} + #[tokio::test] async fn test_scanner_count_rows_empty_result() { let ds = create_numeric_dataset("memory://test_count_rows_empty", 1, 100).await; diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 9b3250f2ca8..20a63263222 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -592,9 +592,37 @@ pub(crate) async fn 
remap_index( })) } +/// Snapshot of every scalar index on a dataset, captured at planning time +/// and consumed by the scalar/aggregate pushdown machinery. +/// +/// Built once per planner invocation by walking the manifest's `IndexMetadata` +/// entries; thereafter all lookups are synchronous, so optimizer rules and the +/// filter parser can interrogate it without needing an async context. #[derive(Debug)] pub struct ScalarIndexInfo { + /// Per-column dispatch table for [`apply_scalar_indices`]: keyed by the + /// full dotted field path (e.g. `"x"`, `"metadata.status.code"`), the same + /// string callers use when referring to columns in filter expressions. + /// + /// The value pairs the column's data type with a [`MultiQueryParser`] + /// that fans out to every per-index parser registered for that column. + /// When a column carries more than one index (e.g. BTree + bitmap), the + /// `MultiQueryParser` tries each in order and the first match wins; the + /// resulting [`crate::scalar::expression::ScalarIndexSearch`] records + /// which specific index was chosen. So *which* index served the query is + /// an output of parsing, not an input — that's why this map is keyed only + /// by column. + /// + /// `fragment_bitmaps`, by contrast, *is* keyed by `(column, index_name)`, + /// because by the time the optimizer needs the bitmap the index name is + /// already pinned in the parsed leaf. indexed_columns: HashMap)>, + /// `(column, index_name) → fragment_bitmap` taken straight off each + /// [`IndexMetadata`] at construction time. Used by the optimizer rule for + /// aggregate pushdown to reason about index coverage synchronously. + /// Indices that omit `fragment_bitmap` (legacy or unsupported) simply + /// don't appear here and so report coverage as unknown. 
+ fragment_bitmaps: HashMap<(String, String), RoaringBitmap>, } impl IndexInformationProvider for ScalarIndexInfo { @@ -603,6 +631,12 @@ impl IndexInformationProvider for ScalarIndexInfo { .get(col) .map(|(ty, parser)| (ty, parser.as_ref() as &dyn ScalarQueryParser)) } + + fn fragment_bitmap(&self, column: &str, index_name: &str) -> Option { + self.fragment_bitmaps + .get(&(column.to_string(), index_name.to_string())) + .cloned() + } } async fn open_index_proto(reader: &dyn Reader) -> Result { @@ -2187,6 +2221,12 @@ impl DatasetIndexInternalExt for Dataset { let indices = self.load_indices().await?; let schema = self.schema(); let mut indexed_fields = Vec::new(); + // (column, index_name) → union of every contributing IndexMetadata's + // fragment_bitmap. Multiple entries can land here for delta-merged + // indices that share a name. We only insert when every contributing + // entry has a bitmap; if any are missing, we leave the entry absent + // so the optimizer treats coverage as unknown. + let mut fragment_bitmaps: HashMap<(String, String), Option> = HashMap::new(); for index in indices.iter().filter(|idx| { let idx_schema = schema.project_by_ids(idx.fields.as_slice(), true); let is_vector_index = idx_schema @@ -2245,6 +2285,23 @@ impl DatasetIndexInternalExt for Dataset { let query_parser = plugin.new_query_parser(index.name.clone(), &index_details.0); if let Some(query_parser) = query_parser { + // Union the per-segment fragment bitmap into this + // (column, index_name) entry. If any segment is missing a + // bitmap, downgrade the entry to None so callers know + // coverage is partial/unknown. 
+ let key = (field_path.clone(), index.name.clone()); + fragment_bitmaps + .entry(key) + .and_modify(|entry| { + if let (Some(acc), Some(seg)) = + (entry.as_mut(), index.fragment_bitmap.as_ref()) + { + *acc |= seg; + } else { + *entry = None; + } + }) + .or_insert_with(|| index.fragment_bitmap.clone()); indexed_fields.push((field_path, (field.data_type(), query_parser))); } } @@ -2269,8 +2326,14 @@ impl DatasetIndexInternalExt for Dataset { ) }); } + // Drop entries we couldn't pin to a known bitmap. + let fragment_bitmaps = fragment_bitmaps + .into_iter() + .filter_map(|(k, v)| v.map(|bm| (k, bm))) + .collect(); Ok(ScalarIndexInfo { indexed_columns: index_info_map, + fragment_bitmaps, }) } diff --git a/rust/lance/src/io/exec.rs b/rust/lance/src/io/exec.rs index f06f575de68..a477d60d56d 100644 --- a/rust/lance/src/io/exec.rs +++ b/rust/lance/src/io/exec.rs @@ -7,6 +7,8 @@ #[cfg(feature = "substrait")] pub mod ann_proto; +pub mod count_from_mask; +pub mod count_pushdown; mod filter; pub mod filtered_read; #[cfg(feature = "substrait")] diff --git a/rust/lance/src/io/exec/count_from_mask.rs b/rust/lance/src/io/exec/count_from_mask.rs new file mode 100644 index 00000000000..df0478ce208 --- /dev/null +++ b/rust/lance/src/io/exec/count_from_mask.rs @@ -0,0 +1,624 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Execute-time half of the count-from-mask category of aggregate pushdown. +//! +//! [`CountFromMaskExec`] computes a `COUNT(*)`-style aggregate's partial +//! state directly from index/manifest metadata, without scanning column +//! data. Conceptually: +//! +//! ```text +//! result = | fragments_allow ∩ optional_prefilter_mask − deletion_mask | +//! ``` +//! +//! Its output schema matches what `AggregateExec(AggregateMode::Partial)` +//! would produce for the same `COUNT` aggregates, so a downstream +//! `AggregateExec(Final)` can combine the result unchanged. +//! +//! 
This is one of four categories of aggregate acceleration we plan to +//! support; the others (mask-to-answer, zone-aware, dimension-keyed) each +//! need additional plumbing — see the corresponding design issue. + +use std::collections::HashMap; +use std::sync::Arc; + +use arrow_array::{Array, BinaryArray, Int64Array, RecordBatch}; +use arrow_schema::{Schema, SchemaRef}; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, + execution_plan::{Boundedness, EmissionType}, + metrics::{ExecutionPlanMetricsSet, MetricsSet}, +}; +use datafusion_physical_expr::EquivalenceProperties; +use datafusion_physical_expr::aggregate::AggregateFunctionExpr; +use futures::{StreamExt, TryStreamExt}; +use lance_core::{Error, Result}; +use lance_select::{RowAddrMask, RowAddrSelection, RowAddrTreeMap}; +use lance_table::format::Fragment; +use roaring::RoaringBitmap; +use tracing::instrument; + +use super::utils::InstrumentedRecordBatchStreamAdapter; +use crate::Dataset; +use crate::index::prefilter::DatasetPreFilter; + +/// An execution node that computes a `COUNT(*)`-style aggregate from an +/// optional row-address mask supplied by an upstream scalar-index search, +/// combined with the dataset's deletion mask and an optional restriction to +/// a fragment subset. +/// +/// The node returns one record batch with one row whose columns are the +/// partial-state representation of each `COUNT` in `aggregate_funcs` — i.e. +/// the same shape an `AggregateExec(Partial)` would emit. +#[derive(Debug)] +pub struct CountFromMaskExec { + dataset: Arc, + /// One per output column. Used only for `state_fields()` to build the + /// output schema; the actual count is computed identically for all of + /// them since every entry is a non-distinct `COUNT()`. + aggregate_funcs: Vec>, + /// Optional [`super::scalar_index::ScalarIndexExec`] producing the row- + /// address mask to count. 
+ prefilter_input: Option>, + /// Restrict the count to this fragment subset. `None` means "every + /// fragment in the dataset." The optimizer rule uses this to scope the + /// pushdown branch of a partial-coverage split plan to the indexed + /// fragments only — the uncovered fragments are handled by a parallel + /// scan branch. + restrict_to_fragments: Option, + schema: SchemaRef, + properties: Arc, + metrics: ExecutionPlanMetricsSet, +} + +impl DisplayAs for CountFromMaskExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + write!(f, "CountFromMask") + } + DisplayFormatType::TreeRender => write!(f, "CountFromMask"), + } + } +} + +impl CountFromMaskExec { + /// Build a new node. + /// + /// `aggregate_funcs` must be a non-empty set of non-distinct `COUNT` + /// aggregates (the optimizer rule guarantees this). `prefilter_input`, + /// if present, must produce a single batch in the scalar-index result + /// schema; that mask is intersected with the dataset's covered + /// fragments and the active deletion mask. + pub fn try_new( + dataset: Arc, + aggregate_funcs: Vec>, + prefilter_input: Option>, + ) -> Result { + Self::try_new_restricted(dataset, aggregate_funcs, prefilter_input, None) + } + + /// Like [`Self::try_new`] but scopes the count to a fragment subset + /// rather than the whole dataset. The optimizer rule uses this for the + /// pushdown branch of a partial-coverage split plan, so the count only + /// covers the fragments the prefilter's index can answer for. 
+ pub fn try_new_restricted( + dataset: Arc, + aggregate_funcs: Vec>, + prefilter_input: Option>, + restrict_to_fragments: Option, + ) -> Result { + if aggregate_funcs.is_empty() { + return Err(Error::invalid_input( + "CountFromMaskExec requires at least one aggregate".to_string(), + )); + } + + let state_fields = aggregate_funcs + .iter() + .map(|agg| agg.state_fields()) + .collect::>>() + .map_err(|e| Error::invalid_input(e.to_string()))? + .into_iter() + .flatten() + .collect::>(); + let state_fields_owned: Vec = + state_fields.iter().map(|f| f.as_ref().clone()).collect(); + let schema: SchemaRef = Arc::new(Schema::new(state_fields_owned)); + + let properties = Arc::new(PlanProperties::new( + EquivalenceProperties::new(schema.clone()), + Partitioning::RoundRobinBatch(1), + EmissionType::Incremental, + Boundedness::Bounded, + )); + + Ok(Self { + dataset, + aggregate_funcs, + prefilter_input, + restrict_to_fragments, + schema, + properties, + metrics: ExecutionPlanMetricsSet::new(), + }) + } + + /// Drain `prefilter_input` (a [`super::scalar_index::ScalarIndexExec`]) + /// to produce the row-address mask it serialized. + async fn load_prefilter( + prefilter_input: Arc, + context: Arc, + ) -> Result { + let mut stream = prefilter_input.execute(0, context).map_err(Error::from)?; + let batch = stream + .try_next() + .await + .map_err(Error::from)? + .ok_or_else(|| { + Error::internal( + "CountFromMaskExec: prefilter input produced no batches".to_string(), + ) + })?; + // Drain any remaining batches so the upstream sees a clean shutdown. 
+ while stream.try_next().await.map_err(Error::from)?.is_some() {} + + let result_col = batch + .column(0) + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::internal(format!( + "CountFromMaskExec: prefilter result column has type {:?}, expected Binary", + batch.column(0).data_type() + )) + })?; + RowAddrMask::from_arrow(result_col) + } + + /// Fold the prefilter, fragment allow list, and deletion mask into a + /// single `AllowList`-shaped [`RowAddrMask`] suitable for counting. + fn combine_masks( + fragments_allow: RowAddrTreeMap, + prefilter: Option, + deletion_mask: Option>, + ) -> RowAddrMask { + let base = RowAddrMask::AllowList(fragments_allow); + let after_prefilter = match prefilter { + None => base, + Some(prefilter) => base & prefilter, + }; + match deletion_mask { + None => after_prefilter, + Some(deletion_mask) => after_prefilter & (*deletion_mask).clone(), + } + } + + /// Count the rows selected by `mask`, looking up `Full`-marker fragments + /// in the manifest so we never need to materialize a + /// `RoaringBitmap::full()`. + fn count_from_mask(mask: &RowAddrMask, dataset: &Dataset) -> Result { + let allow = mask.allow_list().ok_or_else(|| { + Error::internal("CountFromMaskExec: combined mask is not an AllowList".to_string()) + })?; + let frag_map: HashMap = dataset + .fragments() + .iter() + .map(|f| (f.id as u32, f)) + .collect(); + let mut count = 0i64; + for (frag_id, sel) in allow.iter() { + match sel { + RowAddrSelection::Full => { + // The fragment is in the allow list with no deletions + // touching it — its row count is the physical row count. 
+ let frag = frag_map.get(frag_id).ok_or_else(|| { + Error::internal(format!( + "CountFromMaskExec: fragment {} not found in manifest", + frag_id + )) + })?; + let n = frag.physical_rows.ok_or_else(|| { + Error::internal(format!( + "CountFromMaskExec: physical_rows missing for fragment {}", + frag_id + )) + })?; + count += n as i64; + } + RowAddrSelection::Partial(bitmap) => { + count += bitmap.len() as i64; + } + } + } + Ok(count) + } + + #[instrument(name = "count_from_mask", skip_all, level = "debug")] + async fn do_execute( + dataset: Arc, + aggregate_funcs_len: usize, + prefilter_input: Option>, + restrict_to_fragments: Option, + context: Arc, + schema: SchemaRef, + ) -> Result { + let prefilter = match prefilter_input { + None => None, + Some(input) => Some(Self::load_prefilter(input, context.clone()).await?), + }; + + // Anchor the deletion mask against either every dataset fragment or + // the caller-supplied restricted subset. + let dataset_fragments: RoaringBitmap = + dataset.fragments().iter().map(|f| f.id as u32).collect(); + let fragments_covered = match restrict_to_fragments { + Some(restrict) => dataset_fragments & restrict, + None => dataset_fragments, + }; + + // Build the fragments allow list as concrete `[0..physical_rows)` + // ranges rather than `Full` markers. `Full` interacts poorly with + // `BlockList` subtraction — `RowAddrTreeMap::Sub` materializes a + // `RoaringBitmap::full()` (2^32 rows) per fragment when a `Full` + // entry gets a partial block subtracted from it, which inflates + // counts and is expensive. Concrete ranges avoid that path entirely + // and keep `len()` exact at every combine step. 
+ let frag_map: HashMap = dataset + .fragments() + .iter() + .map(|f| (f.id as u32, f)) + .collect(); + let mut fragments_allow = RowAddrTreeMap::new(); + for frag_id in fragments_covered.iter() { + let frag = frag_map.get(&frag_id).ok_or_else(|| { + Error::internal(format!( + "CountFromMaskExec: fragment {} not in manifest", + frag_id + )) + })?; + let physical = frag.physical_rows.ok_or_else(|| { + Error::internal(format!( + "CountFromMaskExec: physical_rows missing for fragment {}", + frag_id + )) + })?; + let mut bitmap = RoaringBitmap::new(); + bitmap.insert_range(0u32..(physical as u32)); + fragments_allow.insert_bitmap(frag_id, bitmap); + } + + // Load the deletion mask for the covered fragments. + let deletion_mask = + match DatasetPreFilter::create_deletion_mask(dataset.clone(), fragments_covered) { + Some(fut) => Some(fut.await?), + None => None, + }; + + let combined = Self::combine_masks(fragments_allow, prefilter, deletion_mask); + let count = Self::count_from_mask(&combined, dataset.as_ref())?; + + // Every aggregate is the same non-distinct COUNT shape — emit the + // count once per output column. + let arrays: Vec> = (0..aggregate_funcs_len) + .map(|_| Arc::new(Int64Array::from(vec![count])) as Arc) + .collect(); + Ok(RecordBatch::try_new(schema, arrays)?) 
+ } +} + +impl ExecutionPlan for CountFromMaskExec { + fn name(&self) -> &str { + "CountFromMaskExec" + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + + fn children(&self) -> Vec<&Arc> { + match &self.prefilter_input { + Some(input) => vec![input], + None => vec![], + } + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> datafusion::error::Result> { + let prefilter_input = match children.len() { + 0 => None, + 1 => Some(children.into_iter().next().unwrap()), + n => { + return Err(datafusion::error::DataFusionError::Internal(format!( + "CountFromMaskExec accepts 0 or 1 children, got {}", + n + ))); + } + }; + Ok(Arc::new(Self { + dataset: self.dataset.clone(), + aggregate_funcs: self.aggregate_funcs.clone(), + prefilter_input, + restrict_to_fragments: self.restrict_to_fragments.clone(), + schema: self.schema.clone(), + properties: self.properties.clone(), + metrics: self.metrics.clone(), + })) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> datafusion::error::Result { + let schema = self.schema.clone(); + let batch_fut = Self::do_execute( + self.dataset.clone(), + self.aggregate_funcs.len(), + self.prefilter_input.clone(), + self.restrict_to_fragments.clone(), + context, + schema.clone(), + ); + let stream = futures::stream::iter(vec![batch_fut]) + .then(|fut| async move { fut.await.map_err(|err| err.into()) }) + .boxed(); + Ok(Box::pin(InstrumentedRecordBatchStreamAdapter::new( + schema, + stream, + partition, + &self.metrics, + ))) + } + + fn partition_statistics( + &self, + _partition: Option, + ) -> datafusion::error::Result { + Ok(datafusion::physical_plan::Statistics { + num_rows: datafusion::common::stats::Precision::Exact(1), + ..datafusion::physical_plan::Statistics::new_unknown(&self.schema) + }) + } + + fn metrics(&self) -> Option { + Some(self.metrics.clone_inner()) + } + + fn properties(&self) -> &Arc { + &self.properties + } + + fn 
supports_limit_pushdown(&self) -> bool { + false + } +} + +#[cfg(test)] +mod tests { + use std::{ops::Bound, sync::Arc}; + + use arrow::datatypes::{Int64Type, UInt64Type}; + use datafusion::common::DFSchema; + use datafusion::execution::TaskContext; + use datafusion::functions_aggregate; + use datafusion::logical_expr::lit; + use datafusion::physical_expr::execution_props::ExecutionProps; + use datafusion::physical_plan::ExecutionPlan; + use datafusion::physical_planner::create_aggregate_expr_and_maybe_filter; + use datafusion::scalar::ScalarValue; + use futures::TryStreamExt; + use lance_core::utils::tempfile::TempStrDir; + use lance_datagen::gen_batch; + use lance_index::IndexType; + use lance_index::scalar::{ + SargableQuery, ScalarIndexParams, + expression::{ScalarIndexExpr, ScalarIndexSearch}, + }; + use lance_select::result::IndexExprResultWireFormat; + use lance_select::{RowAddrMask, RowAddrTreeMap}; + + use super::*; + use crate::Dataset; + use crate::index::DatasetIndexExt; + use crate::io::exec::scalar_index::ScalarIndexExec; + use crate::utils::test::{DatagenExt, FragmentCount, FragmentRowCount}; + + /// Build an `AggregateFunctionExpr` matching `COUNT(*)`. + fn count_star_expr(input_schema: &SchemaRef) -> Arc { + let expr = functions_aggregate::count::count(lit(1)); + let df_schema = DFSchema::try_from(input_schema.as_ref().clone()).unwrap(); + let (agg_expr, _filter, _order_by) = create_aggregate_expr_and_maybe_filter( + &expr, + &df_schema, + input_schema.as_ref(), + &ExecutionProps::default(), + ) + .unwrap(); + agg_expr + } + + struct Fixture { + dataset: Arc, + _tmp: TempStrDir, + } + + /// 4 fragments × 10 rows, ascending `ordered` column with a BTree index. 
+ async fn make_fixture() -> Fixture { + let tmp = TempStrDir::default(); + let mut dataset = gen_batch() + .col("ordered", lance_datagen::array::step::()) + .into_dataset( + tmp.as_str(), + FragmentCount::from(4), + FragmentRowCount::from(10), + ) + .await + .unwrap(); + dataset + .create_index( + &["ordered"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + Fixture { + dataset: Arc::new(dataset), + _tmp: tmp, + } + } + + fn input_schema() -> SchemaRef { + Arc::new(Schema::new(vec![arrow_schema::Field::new( + "ordered", + arrow_schema::DataType::UInt64, + false, + )])) + } + + async fn run(plan: CountFromMaskExec) -> i64 { + let stream = plan.execute(0, Arc::new(TaskContext::default())).unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + assert_eq!(batches.len(), 1); + assert_eq!(batches[0].num_rows(), 1); + batches[0] + .column(0) + .as_any() + .downcast_ref::>() + .expect("count partial state should be Int64") + .value(0) + } + + #[tokio::test] + async fn try_new_rejects_empty_aggregate_funcs() { + let fixture = make_fixture().await; + let err = CountFromMaskExec::try_new(fixture.dataset, vec![], None).unwrap_err(); + assert!(err.to_string().contains("at least one aggregate"), "{err}"); + } + + #[tokio::test] + async fn count_from_mask_mixes_full_and_partial() { + // Synthesize an AllowList containing one Full-marker fragment and + // one Partial bitmap; verify the Full fragment falls back to + // physical_rows from the manifest and Partial falls back to + // bitmap.len(). + let fixture = make_fixture().await; + let mut tm = RowAddrTreeMap::new(); + // Fragment 0: full (10 physical rows). + tm.insert_fragment(0); + // Fragment 1: partial with explicit row addrs. 
+ let row_addr_for = |frag_id: u32, offset: u32| ((frag_id as u64) << 32) | offset as u64; + tm.insert(row_addr_for(1, 0)); + tm.insert(row_addr_for(1, 1)); + tm.insert(row_addr_for(1, 2)); + + let mask = RowAddrMask::AllowList(tm); + let count = CountFromMaskExec::count_from_mask(&mask, fixture.dataset.as_ref()).unwrap(); + assert_eq!(count, 10 + 3); + } + + #[tokio::test] + async fn execute_count_no_prefilter() { + let fixture = make_fixture().await; + let schema = input_schema(); + let plan = CountFromMaskExec::try_new( + fixture.dataset.clone(), + vec![count_star_expr(&schema)], + None, + ) + .unwrap(); + let count = run(plan).await; + assert_eq!(count, 40); // 4 fragments × 10 rows + } + + #[tokio::test] + async fn execute_count_with_allow_list_prefilter() { + let fixture = make_fixture().await; + let schema = input_schema(); + + // `ordered < 25` matches 25 rows across the four fragments. + let prefilter_expr = ScalarIndexExpr::Query(ScalarIndexSearch { + column: "ordered".to_string(), + index_name: "ordered_idx".to_string(), + index_type: "BTree".to_string(), + query: Arc::new(SargableQuery::Range( + Bound::Unbounded, + Bound::Excluded(ScalarValue::UInt64(Some(25))), + )), + needs_recheck: false, + fragment_bitmap: None, + }); + let prefilter: Arc = Arc::new(ScalarIndexExec::new( + fixture.dataset.clone(), + prefilter_expr, + IndexExprResultWireFormat::default(), + )); + + let plan = CountFromMaskExec::try_new( + fixture.dataset.clone(), + vec![count_star_expr(&schema)], + Some(prefilter), + ) + .unwrap(); + let count = run(plan).await; + assert_eq!(count, 25); + } + + #[tokio::test] + async fn execute_count_with_block_list_prefilter() { + let fixture = make_fixture().await; + let schema = input_schema(); + + // NOT(ordered < 25) is a block list of those 25 rows — 40 − 25 = 15. 
+ let prefilter_expr = + ScalarIndexExpr::Not(Box::new(ScalarIndexExpr::Query(ScalarIndexSearch { + column: "ordered".to_string(), + index_name: "ordered_idx".to_string(), + index_type: "BTree".to_string(), + query: Arc::new(SargableQuery::Range( + Bound::Unbounded, + Bound::Excluded(ScalarValue::UInt64(Some(25))), + )), + needs_recheck: false, + fragment_bitmap: None, + }))); + let prefilter: Arc = Arc::new(ScalarIndexExec::new( + fixture.dataset.clone(), + prefilter_expr, + IndexExprResultWireFormat::default(), + )); + + let plan = CountFromMaskExec::try_new( + fixture.dataset.clone(), + vec![count_star_expr(&schema)], + Some(prefilter), + ) + .unwrap(); + let count = run(plan).await; + assert_eq!(count, 15); + } + + #[tokio::test] + async fn execute_count_respects_deletions() { + let fixture = make_fixture().await; + let mut dataset = (*fixture.dataset).clone(); + // Delete the first ten rows of the dataset (which live in fragment 0). + dataset.delete("ordered < 10").await.unwrap(); + let dataset = Arc::new(dataset); + + let schema = input_schema(); + let plan = + CountFromMaskExec::try_new(dataset, vec![count_star_expr(&schema)], None).unwrap(); + let count = run(plan).await; + assert_eq!(count, 30); + } +} diff --git a/rust/lance/src/io/exec/count_pushdown.rs b/rust/lance/src/io/exec/count_pushdown.rs new file mode 100644 index 00000000000..3a3f442aa3e --- /dev/null +++ b/rust/lance/src/io/exec/count_pushdown.rs @@ -0,0 +1,798 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Physical optimizer rule that rewrites a `COUNT(*)`-style aggregate into +//! [`CountFromMaskExec`], answering the count from index metadata and the +//! deletion mask without scanning column data. +//! +//! This is the "count-from-mask" category of aggregate pushdown — one of +//! four planned. The other categories (mask-to-answer, zone-aware, +//! dimension-keyed) will each need their own rule and exec; the surrounding +//! 
infrastructure (the `fragment_bitmap` plumbed on each `ScalarIndexSearch`, +//! the `IndexInformationProvider::fragment_bitmap` lookup) is general +//! enough to be reused. +//! +//! Two rewritten shapes are emitted depending on whether the scalar index +//! backing the filter covers every dataset fragment. +//! +//! **Full coverage** (index ⊇ dataset, or no filter at all): +//! +//! ```text +//! AggregateExec(Final, aggs=[count(...)], group_by=[]) +//! └── CountFromMaskExec { prefilter_input = index_input } +//! ``` +//! +//! **Partial coverage** (index ⊊ dataset — typically appended fragments): +//! +//! ```text +//! AggregateExec(Final, aggs=[count(...)], group_by=[]) +//! └── UnionExec +//! ├── CountFromMaskExec(restrict_to_fragments = indexed) +//! └── AggregateExec(Partial) +//! └── FilteredReadExec(fragments = unindexed, full_filter = …) +//! ``` +//! +//! [`CountFromMaskExec`] emits partial-state, so the outer +//! `AggregateExec(Final)` performs the final combine in either shape. +//! +//! If the prefilter's index coverage is unknown (any leaf is missing +//! `fragment_bitmap`, e.g. constructed outside scanner planning), the rule +//! refuses to fire and leaves the existing scan path in place. 
+ +use std::sync::Arc; + +use datafusion::common::tree_node::{Transformed, TreeNode}; +use datafusion::config::ConfigOptions; +use datafusion::error::Result as DFResult; +use datafusion::physical_optimizer::PhysicalOptimizerRule; +#[allow(deprecated)] +use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; +use datafusion::physical_plan::{ + ExecutionPlan, ExecutionPlanProperties, + aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy}, + coalesce_partitions::CoalescePartitionsExec, + projection::ProjectionExec, + repartition::RepartitionExec, + union::UnionExec, +}; +use datafusion_physical_expr::aggregate::AggregateFunctionExpr; +use datafusion_physical_expr::expressions::{Column, Literal}; +use lance_index::scalar::expression::ScalarIndexExpr; +use log::warn; +use roaring::RoaringBitmap; + +use super::count_from_mask::CountFromMaskExec; +use super::filtered_read::{FilteredReadExec, FilteredReadOptions}; +use super::scalar_index::ScalarIndexExec; + +/// Physical optimizer rule that rewrites a `COUNT(*)`-style aggregate into +/// [`CountFromMaskExec`], optionally splitting into a parallel scan branch +/// when the index has partial coverage of the dataset. +/// +/// Only fires when the shape is verifiably safe; everything outside that +/// envelope (GROUP BY, residual filters, scan ranges, etc.) is left alone +/// for the normal scan path. +#[derive(Debug)] +pub struct CountPushdown; + +impl PhysicalOptimizerRule for CountPushdown { + fn optimize( + &self, + plan: Arc, + _config: &ConfigOptions, + ) -> DFResult> { + Ok(plan + .transform_down(|plan| { + let Some(agg) = plan.as_any().downcast_ref::() else { + return Ok(Transformed::no(plan)); + }; + if let Some(rewritten) = try_rewrite(agg)? { + return Ok(Transformed::yes(rewritten)); + } + Ok(Transformed::no(plan)) + })? 
+ .data) + } + + fn name(&self) -> &str { + "count_pushdown" + } + + fn schema_check(&self) -> bool { + true + } +} + +fn try_rewrite(agg: &AggregateExec) -> DFResult>> { + // We can accelerate Single (Lance scanner shape) and Partial (the shape + // DataFusion's SQL planner emits at the leaf of an aggregate pipeline); + // both produce results we know how to compute from the index. We will + // never accelerate Final or FinalPartitioned — those combine an existing + // partial stream, and the value of this rule is replacing the work that + // produces the partial stream. + let mode = match agg.mode() { + AggregateMode::Single => AggregateMode::Single, + AggregateMode::Partial => AggregateMode::Partial, + _ => return Ok(None), + }; + if !agg.group_expr().is_empty() { + return Ok(None); + } + if agg.aggr_expr().is_empty() { + return Ok(None); + } + + // Every aggregate must be a `COUNT()` shape (i.e. COUNT(*) / + // COUNT(1) / etc.) with no per-aggregate `FILTER (WHERE ...)`. This rule + // is scoped to the count-from-mask category only; other aggregate + // categories (mask-to-answer, zone-aware, dimension-keyed) will need + // their own rules with their own gates. + for (af, filter) in agg.aggr_expr().iter().zip(agg.filter_expr().iter()) { + if !is_count_star(af) { + return Ok(None); + } + if filter.is_some() { + return Ok(None); + } + } + + // The input must be a FilteredReadExec we can prove is safe to skip. + // DataFusion's SQL planner inserts a few row-preserving wrappers above + // the leaf — a `RepartitionExec` for parallelism, an empty + // `ProjectionExec` once the count expression has been resolved to need + // no columns, and `CoalesceBatchesExec` here and there. Walk through + // those to reach the FilteredReadExec. 
+ let Some(filtered_read) = strip_row_preserving_wrappers(agg.children()[0]) else { + return Ok(None); + }; + + // Stable-row-id mode: `DatasetPreFilter::create_deletion_mask` produces + // an AllowList in stable-id space, but `CountFromMaskExec` builds its + // fragments-allow list in row-address space. ANDing across the two + // yields a silently wrong count (rows in fragments > 0 are dropped + // because their stable ids and row addresses share a fragment-id bucket + // only by accident). Until the exec can reconcile the two id spaces, + // refuse to fire — but warn so we notice the lost optimization + // opportunity. + if filtered_read.dataset().manifest().uses_stable_row_ids() { + warn!( + "count_pushdown: skipped because the dataset uses stable row ids; \ + the count will be computed via a full scan. Reconciling the two id spaces \ + would let this query be answered from index metadata." + ); + return Ok(None); + } + + let options = filtered_read.options(); + // A refine filter is a residual the index couldn't fully evaluate — it + // needs column data to apply, which we can't. + if options.refine_filter.is_some() { + return Ok(None); + } + // A full_filter without an index_input means the filter is evaluated by + // scanning every row; not pushdownable. + if options.full_filter.is_some() && filtered_read.index_input().is_none() { + return Ok(None); + } + // LIMIT/OFFSET would change the count. + if options.scan_range_before_filter.is_some() || options.scan_range_after_filter.is_some() { + return Ok(None); + } + // We rely on the deletion mask being applied; with_deleted_rows changes + // that contract. Surfacing as a warning because it shouldn't normally + // pair with an aggregate plan — if we see it, the planner produced a + // shape we could in principle accelerate but currently can't. 
+ if options.with_deleted_rows { + warn!( + "count_pushdown: skipped because the FilteredReadExec was built \ + with with_deleted_rows; the count will be computed via a full \ + scan." + ); + return Ok(None); + } + // Same story for an explicit fragment subset: legitimate, but unexpected + // alongside an aggregate, and we lose the pushdown opportunity. + if options.fragments.is_some() { + warn!( + "count_pushdown: skipped because the FilteredReadExec was scoped \ + to an explicit fragment subset; the count will be computed via a \ + full scan. Intersecting that subset into the coverage logic would \ + let this query be answered from index metadata." + ); + return Ok(None); + } + + let dataset = filtered_read.dataset().clone(); + let dataset_fragments: RoaringBitmap = + dataset.fragments().iter().map(|f| f.id as u32).collect(); + let prefilter_input = filtered_read.index_input().cloned(); + + // If there is a prefilter, inspect its ScalarIndexExpr leaves: + // - Refuse to fire if any leaf is inexact (`needs_recheck`). The + // prefilter's serialized batch carries an Exact/AtMost/AtLeast + // discriminant but `load_prefilter` only reads the row-address mask + // and would treat AtMost as Exact, silently overcounting (and + // symmetrically AtLeast would undercount). + // - Compute the index's fragment coverage from leaf `fragment_bitmap`s. + // `None` means at least one leaf has no bitmap and we can't reason + // about coverage synchronously — refuse to fire. 
+ let index_coverage = match &prefilter_input { + None => None, + Some(input) => { + let scalar_exec = input + .as_any() + .downcast_ref::() + .ok_or_else(|| { + datafusion::error::DataFusionError::Internal( + "count_pushdown: FilteredReadExec.index_input is not a ScalarIndexExec" + .to_string(), + ) + })?; + if scalar_exec.expr().needs_recheck() { + return Ok(None); + } + let Some(coverage) = collect_coverage(scalar_exec.expr()) else { + return Ok(None); + }; + Some(coverage) + } + }; + + let aggr_exprs: Vec> = agg.aggr_expr().to_vec(); + + // Decide on the plan shape. Three cases: + // + // 1. No prefilter (no filter at all): single pushdown branch over every + // dataset fragment. Always safe. + // 2. Prefilter + index covers every dataset fragment: single pushdown + // branch, prefilter feeds in directly. + // 3. Prefilter + index covers a strict subset: split into pushdown over + // indexed fragments + parallel scan over unindexed fragments. + let (partial_stream, partial_state_schema): (Arc, _) = match index_coverage { + None => { + // No prefilter at all (verified above): nothing to restrict. + let exec = CountFromMaskExec::try_new_restricted( + dataset, + aggr_exprs.clone(), + prefilter_input, + None, + )?; + let schema = exec.schema(); + (Arc::new(exec), schema) + } + Some(coverage) if (&dataset_fragments - &coverage).is_empty() => { + // Prefilter exists and the index covers every dataset fragment — + // safe to push the whole count down. + let exec = CountFromMaskExec::try_new_restricted( + dataset, + aggr_exprs.clone(), + prefilter_input, + None, + )?; + let schema = exec.schema(); + (Arc::new(exec), schema) + } + Some(coverage) => { + // Split plan: CountFromMaskExec for the indexed fragments, a + // normal scan + AggregateExec(Partial) for the rest. 
+ let uncovered = &dataset_fragments - &coverage; + let pushdown_exec = CountFromMaskExec::try_new_restricted( + dataset, + aggr_exprs.clone(), + prefilter_input, + Some(&dataset_fragments & &coverage), + )?; + let partial_state_schema = pushdown_exec.schema(); + let pushdown_branch: Arc = Arc::new(pushdown_exec); + let scan_branch = + build_scan_branch(filtered_read, options, &uncovered, aggr_exprs.clone())?; + let union: Arc = + UnionExec::try_new(vec![pushdown_branch, scan_branch])?; + (union, partial_state_schema) + } + }; + + match mode { + AggregateMode::Partial => { + // Caller's parent is already an AggregateExec(Final) that knows + // how to consume multi-partition partial state — substitute our + // partial stream and we're done. + Ok(Some(partial_stream)) + } + AggregateMode::Single => { + // The original AggregateExec(Single) produced final output in one + // step; our exec emits partial state, so add a Final on top to + // recover the original output schema. Final expects a single + // partition of partial-state rows, so coalesce when we have a + // union producing multiple partitions. + let final_input: Arc = + if partial_stream.output_partitioning().partition_count() > 1 { + Arc::new(CoalescePartitionsExec::new(partial_stream)) + } else { + partial_stream + }; + // `AggregateExec::try_new` requires one + // `Option>` per aggregate expression for + // the optional per-aggregate `FILTER (WHERE ...)` clause. We + // rejected any aggregate carrying a filter back at the gate, so + // every slot is `None` here. 
+ let filters: Vec>> = + (0..aggr_exprs.len()).map(|_| None).collect(); + let final_agg = AggregateExec::try_new( + AggregateMode::Final, + PhysicalGroupBy::default(), + aggr_exprs, + filters, + final_input, + partial_state_schema, + )?; + Ok(Some(Arc::new(final_agg))) + } + _ => unreachable!("mode was checked at the top of try_rewrite"), + } +} + +/// Build the scan branch of a partial-coverage split: a `FilteredReadExec` +/// restricted to the uncovered fragments (no `index_input`, the original +/// `full_filter` applied per row) wrapped in `AggregateExec(Partial)` so its +/// partial state can be unioned with the pushdown branch. +fn build_scan_branch( + filtered_read: &FilteredReadExec, + options: &FilteredReadOptions, + uncovered: &RoaringBitmap, + aggr_exprs: Vec>, +) -> DFResult> { + let dataset = filtered_read.dataset().clone(); + let uncovered_fragments: Vec<_> = dataset + .manifest() + .fragments + .iter() + .filter(|f| uncovered.contains(f.id as u32)) + .cloned() + .collect(); + let mut scan_options = options.clone(); + scan_options.fragments = Some(Arc::new(uncovered_fragments)); + let scan = FilteredReadExec::try_new(dataset, scan_options, None)?; + let scan: Arc = Arc::new(scan); + let scan_schema = scan.schema(); + // Per-aggregate `FILTER (WHERE ...)` placeholders — see the matching + // comment in `try_rewrite`; we've already rejected any aggregate that + // carried a filter, so every slot is `None`. + let filters: Vec>> = + (0..aggr_exprs.len()).map(|_| None).collect(); + let partial = AggregateExec::try_new( + AggregateMode::Partial, + PhysicalGroupBy::default(), + aggr_exprs, + filters, + scan, + scan_schema, + )?; + Ok(Arc::new(partial)) +} + +/// Walk through row-preserving wrappers (`RepartitionExec`, +/// `CoalesceBatchesExec`, and identity-or-empty `ProjectionExec`) that +/// DataFusion's planner inserts between an `AggregateExec` and the leaf, and +/// return the underlying `FilteredReadExec` if one is reached. 
+///
+/// "Row-preserving" here means the wrapper changes neither the number of rows
+/// nor the predicate applied to them — it may reshape partitions, batches, or
+/// drop unused columns, but the row population at the bottom is what reaches
+/// the aggregate. That's all the rule needs from these layers, so it's safe to
+/// look past them.
+///
+/// A wrapper that carries a `fetch` limit truncates its output and is therefore
+/// *not* row-preserving; the walk returns `None` when it meets one so that a
+/// limited row stream is never answered with an unlimited count.
+fn strip_row_preserving_wrappers(plan: &Arc<dyn ExecutionPlan>) -> Option<&FilteredReadExec> {
+    let mut current: &dyn ExecutionPlan = plan.as_ref();
+    loop {
+        if let Some(filtered_read) = current.as_any().downcast_ref::<FilteredReadExec>() {
+            return Some(filtered_read);
+        }
+        // Limit pushdown can fold a `LIMIT` into a wrapper (e.g. a coalesce
+        // node) as a `fetch`. Such a node drops rows, so counting everything
+        // beneath it would overcount — refuse to look through it.
+        if current.fetch().is_some() {
+            return None;
+        }
+        let next: &Arc<dyn ExecutionPlan> =
+            if let Some(inner) = current.as_any().downcast_ref::<RepartitionExec>() {
+                inner.input()
+            } else if let Some(inner) = {
+                // `CoalesceBatchesExec` is deprecated upstream but older
+                // planners still emit it, so keep recognising it.
+                #[allow(deprecated)]
+                current.as_any().downcast_ref::<CoalesceBatchesExec>()
+            } {
+                inner.input()
+            } else if let Some(inner) = current.as_any().downcast_ref::<CoalescePartitionsExec>() {
+                inner.input()
+            } else if let Some(proj) = current.as_any().downcast_ref::<ProjectionExec>() {
+                // Only walk through projections that are row-preserving: every
+                // output expression is a direct column reference back to the
+                // input. (Empty projections trivially qualify — DataFusion uses
+                // one when a `COUNT(*)`'s argument no longer needs any actual
+                // columns.)
+                let input_schema = proj.input().schema();
+                let identity = proj.expr().iter().all(|projection_expr| {
+                    projection_expr
+                        .expr
+                        .as_any()
+                        .downcast_ref::<Column>()
+                        .is_some_and(|c| {
+                            // Checked lookup: a column index outside the input
+                            // schema means "not an identity projection", not a
+                            // panic inside the optimizer.
+                            input_schema
+                                .fields()
+                                .get(c.index())
+                                .is_some_and(|f| c.name() == f.name())
+                        })
+                });
+                if !identity {
+                    return None;
+                }
+                proj.input()
+            } else {
+                // Any other node may filter, limit, or reshape rows.
+                return None;
+            };
+        current = next.as_ref();
+    }
+}
+
+/// Walk a `ScalarIndexExpr` and intersect the per-leaf `fragment_bitmap`.
+///
+/// Returns `None` if any leaf is missing a bitmap (coverage unknown). All
+/// three combinators (`And`, `Or`, `Not`) reduce to "every leaf must cover the
+/// fragment for us to give a definitive answer about it" — i.e. intersection.
+fn collect_coverage(expr: &ScalarIndexExpr) -> Option<RoaringBitmap> {
+    match expr {
+        // Negation does not change which fragments the leaf index can speak for.
+        ScalarIndexExpr::Not(inner) => collect_coverage(inner),
+        ScalarIndexExpr::And(lhs, rhs) | ScalarIndexExpr::Or(lhs, rhs) => {
+            // `?` propagates "coverage unknown" from either side.
+            let l = collect_coverage(lhs)?;
+            let r = collect_coverage(rhs)?;
+            Some(l & r)
+        }
+        ScalarIndexExpr::Query(search) => search.fragment_bitmap.clone(),
+    }
+}
+
+/// Returns `true` if `af` is `COUNT(<literal>)` with no DISTINCT.
+fn is_count_star(af: &Arc<AggregateFunctionExpr>) -> bool {
+    if af.fun().name() != "count" {
+        return false;
+    }
+    if af.is_distinct() {
+        return false;
+    }
+    let args = af.expressions();
+    if args.len() != 1 {
+        return false;
+    }
+    let Some(lit) = args[0].as_any().downcast_ref::<Literal>() else {
+        return false;
+    };
+    // `COUNT(NULL)` would always return 0; rule it out so we don't accidentally
+    // produce a wrong answer if the planner ever lets it through.
+    !lit.value().is_null()
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use arrow::datatypes::{Int64Type, UInt64Type};
+    use datafusion::common::tree_node::{TreeNode, TreeNodeRecursion};
+    use datafusion::physical_plan::{ExecutionPlan, displayable};
+    use futures::TryStreamExt;
+    use lance_core::utils::tempfile::TempStrDir;
+    use lance_datagen::gen_batch;
+    use lance_index::IndexType;
+    use lance_index::scalar::{BuiltinIndexType, ScalarIndexParams};
+
+    use super::*;
+    use crate::Dataset;
+    use crate::dataset::scanner::AggregateExpr;
+    use crate::index::DatasetIndexExt;
+    use crate::utils::test::{DatagenExt, FragmentCount, FragmentRowCount};
+
+    struct Fixture {
+        dataset: Arc<Dataset>,
+        _tmp: TempStrDir,
+    }
+
+    /// 4 fragments × 10 rows, ascending `ordered` column with a BTree index.
+ async fn make_fixture() -> Fixture { + let tmp = TempStrDir::default(); + let mut dataset = gen_batch() + .col("ordered", lance_datagen::array::step::()) + .into_dataset( + tmp.as_str(), + FragmentCount::from(4), + FragmentRowCount::from(10), + ) + .await + .unwrap(); + dataset + .create_index( + &["ordered"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + Fixture { + dataset: Arc::new(dataset), + _tmp: tmp, + } + } + + fn plan_contains_pushdown(plan: &Arc) -> bool { + let mut found = false; + plan.apply(|node| { + if node.as_any().is::() { + found = true; + Ok(TreeNodeRecursion::Stop) + } else { + Ok(TreeNodeRecursion::Continue) + } + }) + .unwrap(); + found + } + + fn plan_contains_union(plan: &Arc) -> bool { + let mut found = false; + plan.apply(|node| { + if node.as_any().is::() { + found = true; + Ok(TreeNodeRecursion::Stop) + } else { + Ok(TreeNodeRecursion::Continue) + } + }) + .unwrap(); + found + } + + async fn run_count( + scanner: &mut crate::dataset::scanner::Scanner, + ) -> (Arc, i64) { + scanner + .aggregate(AggregateExpr::builder().count_star().build()) + .unwrap(); + let plan = scanner.create_plan().await.unwrap(); + let stream = datafusion::physical_plan::execute_stream( + plan.clone(), + Arc::new(datafusion::execution::TaskContext::default()), + ) + .unwrap(); + let batches: Vec<_> = stream.try_collect().await.unwrap(); + assert_eq!( + batches.len(), + 1, + "count plan emitted {} batches", + batches.len() + ); + let count = batches[0] + .column(0) + .as_any() + .downcast_ref::>() + .expect("count column should be Int64") + .value(0); + (plan, count) + } + + #[tokio::test] + async fn rule_fires_on_unfiltered_count_star() { + let fixture = make_fixture().await; + let mut scanner = fixture.dataset.scan(); + let (plan, count) = run_count(&mut scanner).await; + assert_eq!(count, 40); + assert!( + plan_contains_pushdown(&plan), + "expected CountFromMaskExec in plan: {}", + 
displayable(plan.as_ref()).indent(true) + ); + assert!( + !plan_contains_union(&plan), + "no union expected for unfiltered count, got: {}", + displayable(plan.as_ref()).indent(true) + ); + } + + #[tokio::test] + async fn rule_fires_when_filter_fully_indexed() { + let fixture = make_fixture().await; + let mut scanner = fixture.dataset.scan(); + scanner.filter("ordered < 25").unwrap(); + let (plan, count) = run_count(&mut scanner).await; + assert_eq!(count, 25); + assert!( + plan_contains_pushdown(&plan), + "expected CountFromMaskExec in plan: {}", + displayable(plan.as_ref()).indent(true) + ); + assert!( + !plan_contains_union(&plan), + "no union expected when index covers every fragment, got: {}", + displayable(plan.as_ref()).indent(true) + ); + } + + #[tokio::test] + async fn rule_emits_split_plan_for_partial_index_coverage() { + // Build index over 4 fragments, then append a 5th — the index now + // covers a strict subset of the dataset. The rule must split into a + // pushdown branch over the indexed fragments and a scan branch over + // the rest, then sum the partials. 
+ use crate::dataset::WriteParams; + let tmp = TempStrDir::default(); + let mut dataset = gen_batch() + .col("ordered", lance_datagen::array::step::()) + .into_dataset( + tmp.as_str(), + FragmentCount::from(4), + FragmentRowCount::from(10), + ) + .await + .unwrap(); + dataset + .create_index( + &["ordered"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + let extra = gen_batch() + .col("ordered", lance_datagen::array::step::()) + .into_reader_rows( + lance_datagen::RowCount::from(10), + lance_datagen::BatchCount::from(1), + ); + let dataset = Dataset::write( + extra, + tmp.as_str(), + Some(WriteParams { + mode: crate::dataset::WriteMode::Append, + max_rows_per_file: 10, + ..Default::default() + }), + ) + .await + .unwrap(); + let dataset = Arc::new(dataset); + + let mut scanner = dataset.scan(); + scanner.filter("ordered < 100").unwrap(); + let (plan, count) = run_count(&mut scanner).await; + // 5 fragments × 10 rows, all match `< 100`. + assert_eq!(count, 50); + assert!( + plan_contains_pushdown(&plan), + "expected pushdown branch in split plan: {}", + displayable(plan.as_ref()).indent(true) + ); + assert!( + plan_contains_union(&plan), + "expected UnionExec for partial-coverage split, got: {}", + displayable(plan.as_ref()).indent(true) + ); + } + + #[tokio::test] + async fn rule_skips_with_stable_row_ids() { + use crate::dataset::WriteParams; + let tmp = TempStrDir::default(); + let mut dataset = gen_batch() + .col("ordered", lance_datagen::array::step::()) + .into_dataset_with_params( + tmp.as_str(), + FragmentCount::from(2), + FragmentRowCount::from(10), + Some(WriteParams { + max_rows_per_file: 10, + enable_stable_row_ids: true, + ..Default::default() + }), + ) + .await + .unwrap(); + dataset.delete("ordered = 0").await.unwrap(); + let dataset = Arc::new(dataset); + + let mut scanner = dataset.scan(); + let (plan, count) = run_count(&mut scanner).await; + assert_eq!(count, 19); + assert!( + 
!plan_contains_pushdown(&plan), + "rule must not fire under stable row IDs, got plan: {}", + displayable(plan.as_ref()).indent(true) + ); + } + + #[tokio::test] + async fn rule_skips_when_filter_needs_refine() { + let tmp = TempStrDir::default(); + let mut dataset = gen_batch() + .col("ordered", lance_datagen::array::step::()) + .col("unindexed", lance_datagen::array::step::()) + .into_dataset( + tmp.as_str(), + FragmentCount::from(4), + FragmentRowCount::from(10), + ) + .await + .unwrap(); + dataset + .create_index( + &["ordered"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + let dataset = Arc::new(dataset); + + let mut scanner = dataset.scan(); + scanner.filter("unindexed > 5").unwrap(); + let (plan, count) = run_count(&mut scanner).await; + assert_eq!(count, 34); + assert!( + !plan_contains_pushdown(&plan), + "rule should not fire with non-indexed filter, got plan: {}", + displayable(plan.as_ref()).indent(true) + ); + } + + #[tokio::test] + async fn rule_skips_when_index_is_inexact() { + // Zonemap-style indices return AtMost (over-approximation) and set + // ScalarIndexSearch.needs_recheck = true. CountFromMaskExec + // ignores the discriminant on the prefilter batch, so firing the + // rule against an inexact index would silently overcount. The rule + // must refuse — and the scan path with its recheck still answers + // correctly. 
+ let tmp = TempStrDir::default(); + let mut dataset = gen_batch() + .col("ordered", lance_datagen::array::step::()) + .into_dataset( + tmp.as_str(), + FragmentCount::from(4), + FragmentRowCount::from(10), + ) + .await + .unwrap(); + dataset + .create_index( + &["ordered"], + IndexType::ZoneMap, + None, + &ScalarIndexParams::for_builtin(BuiltinIndexType::ZoneMap), + true, + ) + .await + .unwrap(); + let dataset = Arc::new(dataset); + + let mut scanner = dataset.scan(); + scanner.filter("ordered < 25").unwrap(); + let (plan, count) = run_count(&mut scanner).await; + assert_eq!(count, 25); + assert!( + !plan_contains_pushdown(&plan), + "rule must not fire when the index produces inexact (needs_recheck) results, \ + got plan: {}", + displayable(plan.as_ref()).indent(true) + ); + } + + #[tokio::test] + async fn rule_skips_count_with_group_by() { + let fixture = make_fixture().await; + let mut scanner = fixture.dataset.scan(); + scanner + .aggregate( + AggregateExpr::builder() + .group_by("ordered") + .count_star() + .build(), + ) + .unwrap(); + let plan = scanner.create_plan().await.unwrap(); + assert!( + !plan_contains_pushdown(&plan), + "rule should not fire for GROUP BY: {}", + displayable(plan.as_ref()).indent(true) + ); + } +} diff --git a/rust/lance/src/io/exec/optimizer.rs b/rust/lance/src/io/exec/optimizer.rs index f031e10ce19..96a93823424 100644 --- a/rust/lance/src/io/exec/optimizer.rs +++ b/rust/lance/src/io/exec/optimizer.rs @@ -171,6 +171,10 @@ impl PhysicalOptimizerRule for SimplifyProjection { pub fn get_physical_optimizer() -> PhysicalOptimizer { PhysicalOptimizer::with_rules(vec![ + // Rewrite `COUNT(*)`-style aggregates into CountFromMaskExec so they + // can be answered without scanning column data. Runs before the + // generic rules so they don't see the rewritten subtree. 
+ Arc::new(crate::io::exec::count_pushdown::CountPushdown), Arc::new(crate::io::exec::optimizer::CoalesceTake), Arc::new(crate::io::exec::optimizer::SimplifyProjection), // Push down limit into FilteredReadExec and other Execs via with_fetch() diff --git a/rust/lance/src/io/exec/scalar_index.rs b/rust/lance/src/io/exec/scalar_index.rs index ade4995fb4b..cc2abb6cc30 100644 --- a/rust/lance/src/io/exec/scalar_index.rs +++ b/rust/lance/src/io/exec/scalar_index.rs @@ -116,6 +116,7 @@ impl ScalarIndexExec { &self.dataset } + /// The parsed scalar-index expression this node will evaluate. pub fn expr(&self) -> &ScalarIndexExpr { &self.expr } @@ -380,6 +381,7 @@ impl MapIndexExec { index_type: String::new(), query: Arc::new(SargableQuery::IsIn(index_vals)), needs_recheck: false, + fragment_bitmap: None, }); let query_result = query.evaluate(dataset.as_ref(), metrics.as_ref()).await?; if !query_result.is_exact() { @@ -848,6 +850,7 @@ mod tests { Bound::Excluded(ScalarValue::UInt64(Some(47))), )), needs_recheck: false, + fragment_bitmap: None, }); let fragments = dataset.fragments().clone(); @@ -892,6 +895,7 @@ mod tests { Bound::Excluded(ScalarValue::UInt64(Some(47))), )), needs_recheck: false, + fragment_bitmap: None, }); let verify = async |plan: ScalarIndexExec, schema: Arc| { @@ -943,6 +947,7 @@ mod tests { Bound::Excluded(ScalarValue::UInt64(Some(47))), )), needs_recheck: false, + fragment_bitmap: None, }); // These plans aren't even valid but it appears we defer all work (even validation) until diff --git a/rust/lance/tests/count_pushdown/mod.rs b/rust/lance/tests/count_pushdown/mod.rs new file mode 100644 index 00000000000..aaa3f5f539e --- /dev/null +++ b/rust/lance/tests/count_pushdown/mod.rs @@ -0,0 +1,183 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! End-to-end integration tests for [`CountPushdown`] when DataFusion +//! does the planning (i.e. an aggregate built from SQL through the public +//! 
[`LanceTableProvider`] surface), as opposed to going through +//! `Scanner::create_plan`. +//! +//! The plan shape DataFusion produces for a SQL aggregate differs from the +//! scanner's: it emits `AggregateExec(Final) → CoalescePartitionsExec → +//! AggregateExec(Partial) → LanceTableScan` rather than a single +//! `AggregateExec(Single)`. This file pins that down so future aggregate- +//! pushdown categories can be added here with the test scaffolding already +//! in place. + +use std::sync::Arc; + +use arrow::datatypes::UInt64Type; +use arrow_array::types::Int64Type; +use datafusion::common::tree_node::{TreeNode, TreeNodeRecursion}; +use datafusion::execution::SessionStateBuilder; +use datafusion::physical_plan::{ExecutionPlan, displayable}; +use datafusion::prelude::SessionContext; +use futures::TryStreamExt; +use lance::Dataset; +use lance::datafusion::LanceTableProvider; +use lance::dataset::WriteParams; +use lance::index::DatasetIndexExt; +use lance::io::exec::count_from_mask::CountFromMaskExec; +use lance::io::exec::count_pushdown::CountPushdown; +use lance_core::utils::tempfile::TempStrDir; +use lance_datagen::{BatchCount, RowCount, array, gen_batch}; +use lance_index::IndexType; +use lance_index::scalar::ScalarIndexParams; + +/// Build a 4-fragment, 10-row-per-fragment dataset with a BTREE index on `x`. 
+async fn make_indexed_dataset() -> (Arc, TempStrDir) { + let tmp = TempStrDir::default(); + let reader = gen_batch() + .col("x", array::step::()) + .into_reader_rows(RowCount::from(10), BatchCount::from(4)); + let mut dataset = Dataset::write( + reader, + tmp.as_str(), + Some(WriteParams { + max_rows_per_file: 10, + ..Default::default() + }), + ) + .await + .unwrap(); + dataset + .create_index( + &["x"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + (Arc::new(dataset), tmp) +} + +/// Build a `SessionContext` configured with the Lance physical optimizer rule +/// for aggregate pushdown, then register `dataset` under the name `t`. +fn lance_aware_context(dataset: Arc) -> SessionContext { + let state = SessionStateBuilder::new() + .with_default_features() + .with_physical_optimizer_rule(Arc::new(CountPushdown)) + .build(); + let ctx = SessionContext::new_with_state(state); + ctx.register_table( + "t", + Arc::new(LanceTableProvider::new(dataset, false, false)), + ) + .unwrap(); + ctx +} + +fn plan_contains_pushdown(plan: &Arc) -> bool { + let mut found = false; + plan.apply(|node| { + if node.as_any().is::() { + found = true; + Ok(TreeNodeRecursion::Stop) + } else { + Ok(TreeNodeRecursion::Continue) + } + }) + .unwrap(); + found +} + +async fn execute_count(plan: Arc) -> i64 { + let stream = datafusion::physical_plan::execute_stream( + plan, + Arc::new(datafusion::execution::TaskContext::default()), + ) + .unwrap(); + let batches: Vec<_> = stream.try_collect().await.unwrap(); + let total: i64 = batches + .iter() + .map(|b| { + b.column(0) + .as_any() + .downcast_ref::>() + .expect("count column should be Int64") + .iter() + .map(|v| v.unwrap_or(0)) + .sum::() + }) + .sum(); + total +} + +#[tokio::test] +async fn sql_count_star_with_indexed_filter() { + // SELECT COUNT(*) FROM t WHERE x < 25 + // + // The rule should fire on DataFusion's `AggregateExec(Partial)` node at + // the leaf of the aggregate pipeline, 
replacing the column scan with + // `CountFromMaskExec` while the outer `AggregateExec(Final)` keeps + // doing the cross-partition combine. + let (dataset, _tmp) = make_indexed_dataset().await; + let ctx = lance_aware_context(dataset); + + let df = ctx + .sql("SELECT COUNT(*) FROM t WHERE x < 25") + .await + .unwrap(); + let plan = df.create_physical_plan().await.unwrap(); + assert!( + plan_contains_pushdown(&plan), + "expected CountFromMaskExec in SQL plan, got:\n{}", + displayable(plan.as_ref()).indent(true) + ); + assert_eq!(execute_count(plan).await, 25); +} + +#[tokio::test] +async fn sql_unfiltered_count_star_uses_statistics() { + // SELECT COUNT(*) FROM t with no filter is answered by DataFusion + // statically from LanceTableProvider's row-count statistic — never + // reaches an `AggregateExec` for our rule to look at. Pin that + // behaviour: the rule should not fire, and the answer is correct. + let (dataset, _tmp) = make_indexed_dataset().await; + let ctx = lance_aware_context(dataset); + + let df = ctx.sql("SELECT COUNT(*) FROM t").await.unwrap(); + let plan = df.create_physical_plan().await.unwrap(); + assert!( + !plan_contains_pushdown(&plan), + "unfiltered COUNT(*) should be resolved from statistics, got:\n{}", + displayable(plan.as_ref()).indent(true) + ); + assert_eq!(execute_count(plan).await, 40); +} + +#[tokio::test] +async fn sql_count_distinct_does_not_fire_yet() { + // SELECT COUNT(DISTINCT x) FROM t WHERE x < 25 + // + // `is_count_star` rejects distinct, so this rule never fires for + // distinct counts — they belong to the mask-to-answer category and will + // need their own rule (e.g. over a bitmap-index dictionary). This test + // pins the not-firing behaviour and the scaffold for the future test. 
+ let (dataset, _tmp) = make_indexed_dataset().await; + let ctx = lance_aware_context(dataset); + + let df = ctx + .sql("SELECT COUNT(DISTINCT x) FROM t WHERE x < 25") + .await + .unwrap(); + let plan = df.create_physical_plan().await.unwrap(); + assert!( + !plan_contains_pushdown(&plan), + "CountFromMaskExec must not fire for COUNT(DISTINCT) yet: \n{}", + displayable(plan.as_ref()).indent(true) + ); + // Correctness via the scan path: values 0..25 are all distinct. + assert_eq!(execute_count(plan).await, 25); +} diff --git a/rust/lance/tests/integration_tests.rs b/rust/lance/tests/integration_tests.rs index 81c2535dd9c..7a6d3e71ca4 100644 --- a/rust/lance/tests/integration_tests.rs +++ b/rust/lance/tests/integration_tests.rs @@ -3,6 +3,7 @@ // NOTE: we only create one integration test binary, to keep compilation overhead down. +mod count_pushdown; #[cfg(feature = "slow_tests")] mod query; #[cfg(feature = "slow_tests")] From bd31c79cb1ca69f671b55cc82e97753fd9345e0e Mon Sep 17 00:00:00 2001 From: Dan Rammer Date: Wed, 3 Jun 2026 16:27:49 -0500 Subject: [PATCH 009/177] feat: dedup FTS results across LSM tiers in LsmFtsSearchPlanner (#7066) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What Adds within-tier PK dedup to `LsmFtsSearchPlanner` so an FTS query over the LSM tiers never surfaces the same primary key twice. Previously the planner unioned per-source FTS results with **no** cross-source dedup, so a PK present in multiple tiers — or updated within the active memtable — surfaced more than once. This ports the dedup the vector planner (`LsmVectorSearchPlanner`) already does: - **Flushed sources**: `PkHashFilterExec` block-list (`compute_source_block_lists`) drops rows superseded by a newer generation. 
- **Active memtable**: emit `_rowid` and wrap in `WithinSourceDedupExec(KeepMaxRowAddr)` to collapse duplicate-PK appends — the FTS inverted index is append-only, so an in-memtable update leaves both versions searchable. - `with_overfetch_factor` builder so a blocked source fetches `ceil(k * factor)` and still yields `k` live rows after the block-list filter. ## Known limitation (follow-up) A *predicate-crossing* update within the active memtable — where the newest version no longer matches the query — can still leak the stale version, because `WithinSourceDedupExec` only dedups among rows the index returned. This is the same gap the vector active arm already documents ("a fresh version evicted from the over-fetched top-k still leaks"). The fix — a predicate-independent newest-per-PK recency filter over the active memtable, shared by the vector + FTS arms — is a separate PR. ## Context Enables FTS over the WAL fresh tier in sophon (lancedb/sophon#6146). Draft pending that integration. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-authored-by: Claude Opus 4.8 (1M context) --- .../src/dataset/mem_wal/scanner/fts_search.rs | 170 +++++++++++++++++- 1 file changed, 167 insertions(+), 3 deletions(-) diff --git a/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs b/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs index aa086a80e66..e3ef44d8b1a 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs @@ -44,13 +44,15 @@ use datafusion::physical_plan::ExecutionPlan; use datafusion::physical_plan::sorts::sort::SortExec; use datafusion::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; use datafusion::physical_plan::union::UnionExec; -use lance_core::{Error, Result, is_system_column}; +use lance_core::{Error, ROW_ID, Result, is_system_column}; use lance_index::scalar::FullTextSearchQuery; use lance_index::scalar::inverted::query::FtsQuery as IndexFtsQuery; use 
tracing::instrument; +use super::block_list::compute_source_block_lists; use super::collector::LsmDataSourceCollector; use super::data_source::LsmDataSource; +use super::exec::{DedupDirection, PkHashFilterExec, WithinSourceDedupExec}; use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset}; use super::projection::project_to_canonical; use crate::dataset::mem_wal::memtable::scanner::MemTableScanner; @@ -61,6 +63,11 @@ use crate::session::Session; /// require an import for one string constant. pub const SCORE_COLUMN: &str = "_score"; +/// Default over-fetch multiple for blocked sources. `1.0` keeps cross-generation +/// dedup on with no over-fetch; callers (e.g. the sophon WAL handler) raise it +/// so a blocked source still yields `k` live rows after the block-list filter. +const DEFAULT_OVERFETCH_FACTOR: f64 = 1.0; + /// Plans local-scoring FTS queries over LSM data. pub struct LsmFtsSearchPlanner { collector: LsmDataSourceCollector, @@ -70,6 +77,8 @@ pub struct LsmFtsSearchPlanner { session: Option>, /// Cache of opened flushed-generation datasets. flushed_cache: Option>, + /// Over-fetch multiple for blocked sources (clamped to `>= 1.0`). + overfetch_factor: f64, } impl LsmFtsSearchPlanner { @@ -85,9 +94,17 @@ impl LsmFtsSearchPlanner { base_schema, session: None, flushed_cache: None, + overfetch_factor: DEFAULT_OVERFETCH_FACTOR, } } + /// Set the over-fetch multiple for blocked sources so they still yield `k` + /// live rows after cross-generation block-list filtering. Clamped to `>= 1.0`. + pub fn with_overfetch_factor(mut self, factor: f64) -> Self { + self.overfetch_factor = factor; + self + } + /// Thread a session into flushed-generation opens so the first open /// populates the shared index / file-metadata caches. 
pub fn with_session(mut self, session: Arc) -> Self { @@ -137,12 +154,63 @@ impl LsmFtsSearchPlanner { return self.empty_plan(&target_schema); } + // Per-source PK-hash block sets for cross-generation dedup (NEWER(G) + // per shard; base = union of all gens). Query-type-agnostic — same + // call the vector planner makes. `Box::pin` keeps the future off + // `clippy::large_futures`. + let block_lists = Box::pin(compute_source_block_lists( + &sources, + &self.pk_columns, + self.session.as_ref(), + self.flushed_cache.as_ref(), + )) + .await?; + let overfetch = self.overfetch_factor.max(1.0); + let mut per_source_plans: Vec> = Vec::with_capacity(sources.len()); for source in &sources { + let is_active = matches!(source, LsmDataSource::ActiveMemTable { .. }); + let blocked = block_lists.get(&(source.shard_id(), source.generation())); + // Over-fetch a blocked source so the post-filter still yields k live + // rows. The active arm returns all matches (no builder limit), so its + // within-source dedup needs no over-fetch hint. + let fetch_k = if blocked.is_some() { + ((k as f64) * overfetch).ceil() as usize + } else { + k + }; + let plan = self - .build_source_plan(source, column, &query, k, projection) + .build_source_plan(source, column, &query, fetch_k, projection, is_active) .await?; - let normalized = project_to_canonical(plan, &target_schema)?; + + // Dedup, mirroring LsmVectorSearchPlanner: + // * active: collapse duplicate-PK appends to the newest insert + // (larger _rowid = inserted later). The FTS index is append-only, + // so an in-memtable update leaves both versions searchable. + // * flushed/base: drop rows superseded by a newer generation via the + // block-list (within-gen is handled by the flushed deletion vector). 
+ let deduped = if is_active { + Arc::new(WithinSourceDedupExec::new( + plan, + self.pk_columns.clone(), + ROW_ID, + DedupDirection::KeepMaxRowAddr, + )) as Arc + } else if let Some(set) = blocked { + Arc::new(PkHashFilterExec::new( + plan, + self.pk_columns.clone(), + set.clone(), + k, + )) as Arc + } else { + plan + }; + + // Normalize to canonical. This also drops the active arm's _rowid, + // which the canonical FTS schema omits — it served only the dedup. + let normalized = project_to_canonical(deduped, &target_schema)?; per_source_plans.push(normalized); } @@ -195,6 +263,7 @@ impl LsmFtsSearchPlanner { query: &FullTextSearchQuery, k: usize, projection: Option<&[String]>, + emit_row_id: bool, ) -> Result> { match source { LsmDataSource::BaseTable { dataset } => { @@ -232,6 +301,11 @@ impl LsmFtsSearchPlanner { MemTableScanner::new(batch_store.clone(), index_store.clone(), schema.clone()); let cols = self.fts_scanner_projection(projection); scanner.project(&cols.iter().map(|s| s.as_str()).collect::>()); + // Emit `_rowid` (row position) so the planner can collapse + // duplicate-PK appends via WithinSourceDedupExec before the union. + if emit_row_id { + scanner.with_row_id(); + } // `MemTableScanner::full_text_search` takes a raw match // string; richer query shapes (phrase/boolean/fuzzy) can // be plumbed through once the MemTable scanner accepts a @@ -561,4 +635,94 @@ mod tests { } } } + + #[tokio::test] + async fn local_mode_active_dedups_updated_pk_keeping_newest() { + // The active memtable is an append log and the FTS index is + // append-only, so a PK updated before flush is searchable as two + // row-positions. WithinSourceDedupExec(KeepMaxRowAddr) must collapse + // them to the newest insert. Without it the same PK would surface + // twice (criterion 2 violation). 
+ let schema = fts_schema(); + let batch_store = Arc::new(BatchStore::with_capacity(16)); + let mut indexes = IndexStore::new(); + indexes.add_fts("text_fts".to_string(), 1, "text".to_string()); + + // First append (positions 0,1): id=1 is the stale version of the PK. + let batch_old = make_batch(&schema, &[1, 2], &["lance stale version", "other doc"]); + batch_store.append(batch_old.clone()).unwrap(); + indexes + .insert_with_batch_position(&batch_old, 0, Some(0)) + .unwrap(); + + // Second append (position 2): id=1 updated — same PK, later row. + let batch_new = make_batch(&schema, &[1], &["lance fresh version"]); + batch_store.append(batch_new.clone()).unwrap(); + indexes + .insert_with_batch_position(&batch_new, 2, Some(1)) + .unwrap(); + let indexes = Arc::new(indexes); + + let tmp = tempfile::tempdir().unwrap(); + let base_uri = format!("{}/base", tmp.path().to_str().unwrap()); + let collector = LsmDataSourceCollector::without_base_table(base_uri, vec![]) + .with_in_memory_memtables( + uuid::Uuid::new_v4(), + InMemoryMemTables { + active: InMemoryMemTableRef { + batch_store, + index_store: indexes, + schema: schema.clone(), + generation: 1, + }, + frozen: vec![], + }, + ); + + let planner = LsmFtsSearchPlanner::new(collector, vec!["id".to_string()], schema); + let plan = planner + .plan_search( + "text", + FullTextSearchQuery::new("lance".to_string()), + 10, + None, + ) + .await + .expect("planner should produce an active-only plan"); + + let ctx = datafusion::prelude::SessionContext::new(); + let stream = plan.execute(0, ctx.task_ctx()).unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + + let mut rows: Vec<(i32, String)> = Vec::new(); + for b in &batches { + let ids = b + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let texts = b + .column_by_name("text") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..b.num_rows() { + rows.push((ids.value(i), texts.value(i).to_string())); + } + } 
+ + // id=1 must appear exactly once, and it must be the *newest* version. + let id1: Vec<&(i32, String)> = rows.iter().filter(|(id, _)| *id == 1).collect(); + assert_eq!( + id1.len(), + 1, + "updated PK id=1 must be deduped to one row; got {rows:?}" + ); + assert_eq!( + id1[0].1, "lance fresh version", + "dedup must keep the newest (max row-position) version" + ); + } } From 71990be8152f6c7969df0b487bb3319d7a839ff0 Mon Sep 17 00:00:00 2001 From: Wyatt Alt Date: Wed, 3 Jun 2026 15:22:48 -0700 Subject: [PATCH 010/177] perf: block-max pruning on the disjunctive WAND path (#7089) A single-term match query runs ~3x slower as an OR than as the logically identical AND, returning byte-identical top-k. The conjunctive WAND path skips blocks whose block-max cannot reach the top-k threshold; the disjunctive path had the machinery but never used it, so it scored every posting entry one at a time. This adds the same skip to the union path: when the block-max upper bound over every iterator overlapping the current window cannot beat the threshold, it advances to the next block boundary instead of scoring each document. The bound includes the `head` iterators (later documents still inside the window), so a skip only fires when no document in the window can qualify and results are unchanged. Single-term OR drops to AND-level latency (20x+ faster on common terms in a Zipf corpus). Phrase queries run on the AND path and are unaffected. Covered by `test_or_single_term_block_skip_matches_and`, which asserts OR and AND return the same top-k and that pruning skips a block. 
Co-authored-by: Claude Opus 4.8 (1M context) --- rust/lance-index/src/scalar/inverted/wand.rs | 120 +++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/rust/lance-index/src/scalar/inverted/wand.rs b/rust/lance-index/src/scalar/inverted/wand.rs index 2f14ecc7da0..995e8c0bb1c 100644 --- a/rust/lance-index/src/scalar/inverted/wand.rs +++ b/rust/lance-index/src/scalar/inverted/wand.rs @@ -862,6 +862,19 @@ impl<'a, S: Scorer> Wand<'a, S> { continue; } + // Block-Max WAND pruning: skip the whole window when its score upper + // bound cannot reach the top-k threshold. + if self.threshold > 0.0 && self.or_block_window_max() <= self.threshold { + // On the final block `up_to` is the `u64::MAX` sentinel; step once + // there to avoid seeking past the valid doc id range. + let skip_to = match self.up_to { + Some(up_to) if up_to < u32::MAX as u64 => up_to + 1, + _ => target + 1, + }; + self.push_back_leads(skip_to); + continue; + } + let Some(doc) = self.lead.first().and_then(|posting| posting.doc()) else { self.push_back_leads(target + 1); continue; @@ -1035,6 +1048,24 @@ impl<'a, S: Scorer> Wand<'a, S> { } } + /// Upper bound on the score of any document in the window `[target, up_to]` + /// for a disjunction. Sums the block-max of every overlapping iterator: + /// `lead`, `head` (later docs still in the window, which + /// `can_target_beat_threshold` omits), and the tail via `tail_max_score`. 
+ fn or_block_window_max(&self) -> f32 { + let lead: f32 = self + .lead + .iter() + .map(|posting| posting.block_max_score()) + .sum(); + let head: f32 = self + .head + .iter() + .map(|posting| posting.posting.block_max_score()) + .sum(); + lead + head + self.tail_max_score + } + fn can_target_beat_threshold(&mut self, target: u64) -> bool { if self.up_to.is_none_or(|up_to| target > up_to) { self.update_max_scores(target); @@ -1496,6 +1527,8 @@ mod tests { use arrow::buffer::ScalarBuffer; use rstest::rstest; + use std::sync::atomic::{AtomicUsize, Ordering}; + use super::*; use crate::scalar::inverted::scorer::IndexBM25Scorer; use crate::{ @@ -1542,6 +1575,23 @@ mod tests { } } + // Inverse-doc-length scorer that counts scored documents, so a test can + // assert that block-max pruning skipped blocks. + struct CountingScorer { + scored: Arc, + } + + impl Scorer for CountingScorer { + fn query_weight(&self, _token: &str) -> f32 { + 1.0 + } + + fn doc_weight(&self, freq: u32, doc_tokens: u32) -> f32 { + self.scored.fetch_add(1, Ordering::Relaxed); + freq as f32 / doc_tokens as f32 + } + } + fn generate_posting_list( doc_ids: Vec, max_score: f32, @@ -1721,6 +1771,76 @@ mod tests { assert!(result.is_ok()); } + #[test] + fn test_or_single_term_block_skip_matches_and() { + // Hot docs occupy the middle block; the flanking blocks score far below + // the threshold. A single-term disjunction must skip them yet return what + // the conjunctive path returns. 
+ let total = 3 * BLOCK_SIZE as u32; + let hot = BLOCK_SIZE as u32..BLOCK_SIZE as u32 + 12; + + let mut docs = DocSet::default(); + for row_id in 0..total { + // hot docs get distinct scores 1/1..1/12; the rest score 0.001 + let doc_tokens = if hot.contains(&row_id) { + row_id - hot.start + 1 + } else { + 1000 + }; + docs.append(row_id as u64, doc_tokens); + } + + let params = FtsSearchParams::new().with_limit(Some(10)); + let run = |operator| { + let scored = Arc::new(AtomicUsize::new(0)); + let posting = PostingIterator::with_query_weight( + String::from("term"), + 0, + 0, + 1.0, + generate_posting_list( + (0..total).collect(), + 1.0, + Some(vec![0.001, 1.0, 0.001]), + true, + ), + docs.len(), + ); + let mut wand = Wand::new( + operator, + std::iter::once(posting), + &docs, + CountingScorer { + scored: scored.clone(), + }, + ); + let hits = wand + .search( + &params, + Arc::new(RowAddrMask::default()), + &NoOpMetricsCollector, + ) + .unwrap(); + let mut row_ids = hits.iter().map(|hit| hit.row_id).collect::>(); + row_ids.sort_unstable(); + (row_ids, scored.load(Ordering::Relaxed)) + }; + + let (or_hits, or_scored) = run(Operator::Or); + let (and_hits, _) = run(Operator::And); + + let expected = (hot.start..hot.start + 10) + .map(u64::from) + .collect::>(); + assert_eq!(or_hits, expected, "OR must return the top-k"); + assert_eq!(or_hits, and_hits, "OR and AND must agree for a single term"); + // Without pruning OR scores all `total` docs; with it the cold blocks are skipped.
+ assert!( + or_scored <= 2 * BLOCK_SIZE, + "expected pruning to skip a block, but scored {or_scored} of {total}", + ); + } + #[test] fn test_wand_new_uses_precomputed_query_weight() { let mut docs = DocSet::default(); From f6e2bf40fae4c472c046c100f6a73ceae49a3b16 Mon Sep 17 00:00:00 2001 From: Wyatt Alt Date: Wed, 3 Jun 2026 16:59:39 -0700 Subject: [PATCH 011/177] perf(fts): share a top-k threshold across partitions during WAND (#7062) Each partition ran WAND from a cold threshold and built its own local top-k, so a common term paid full block-max work in every partition. Share an Arc<AtomicU32> floor across a query's partitions: each publishes its local k-th (fetch_max) and reads it back as a pruning floor. The k-th of the union is >= any single partition's k-th, so the shared value is a lower bound on the global k-th and never drops a real top-k doc. --------- Co-authored-by: Claude Opus 4.8 (1M context) --- rust/lance-index/src/scalar/inverted/index.rs | 13 +- rust/lance-index/src/scalar/inverted/wand.rs | 131 +++++++++++++++++- 2 files changed, 137 insertions(+), 7 deletions(-) diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index 520d46a5097..8acfffb3486 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -3,7 +3,7 @@ use std::fmt::{Debug, Display}; use std::sync::Arc; -use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering}; use std::{ cmp::{Reverse, min}, collections::BinaryHeap, @@ -651,6 +651,11 @@ impl InvertedIndex { let mask = prefilter.mask(); let mut candidates = BinaryHeap::new(); + // Shared top-k floor across this query's partitions. Seeded to -inf so + // the first real score wins; each partition publishes its local k-th + // and prunes against the running global k-th (a lower bound on the true + // global k-th — see `Wand::shared_threshold`).
+ let shared_threshold = Arc::new(AtomicU32::new(f32::NEG_INFINITY.to_bits())); let parts = self .partitions .iter() @@ -660,6 +665,7 @@ impl InvertedIndex { let params = params.clone(); let mask = mask.clone(); let metrics = metrics.clone(); + let shared_threshold = shared_threshold.clone(); async move { let postings = part .load_posting_lists(tokens.as_ref(), params.as_ref(), metrics.as_ref()) @@ -687,6 +693,7 @@ impl InvertedIndex { mask, postings, metrics.as_ref(), + shared_threshold, )?; Ok(PartitionCandidates { tokens_by_position, @@ -1252,6 +1259,7 @@ impl InvertedPartition { mask: Arc, postings: Vec, metrics: &dyn MetricsCollector, + shared_threshold: Arc, ) -> Result> { if postings.is_empty() { return Ok(Vec::new()); @@ -1259,7 +1267,8 @@ impl InvertedPartition { // let local_metrics = LocalMetricsCollector::default(); let scorer = IndexBM25Scorer::new(std::iter::once(self)); - let mut wand = Wand::new(operator, postings.into_iter(), &self.docs, scorer); + let mut wand = Wand::new(operator, postings.into_iter(), &self.docs, scorer) + .with_shared_threshold(shared_threshold); let hits = wand.search(params, mask, metrics)?; // local_metrics.dump_into(metrics); Ok(hits) diff --git a/rust/lance-index/src/scalar/inverted/wand.rs b/rust/lance-index/src/scalar/inverted/wand.rs index 995e8c0bb1c..96cc8146fa6 100644 --- a/rust/lance-index/src/scalar/inverted/wand.rs +++ b/rust/lance-index/src/scalar/inverted/wand.rs @@ -2,6 +2,7 @@ // SPDX-FileCopyrightText: Copyright The Lance Authors use std::ops::Deref; +use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, LazyLock}; use std::{cell::UnsafeCell, collections::BinaryHeap}; use std::{cmp::Reverse, fmt::Debug}; @@ -576,6 +577,22 @@ pub struct Wand<'a, S: Scorer> { and_last_doc: Option, docs: &'a DocSet, scorer: S, + // Shared cross-partition top-k floor. 
Each partition publishes its local + // k-th score (`atomic_store_max_f32`) and prunes against the running value + // -- a lower bound on the global k-th, so it never drops a real top-k doc. + shared_threshold: Option>, +} + +/// Monotonically raise an f32 stored in an `AtomicU32` to `val`. CAS loop (not a +/// bit-max) so it stays correct for negative scores -- BM25 idf can go negative. +fn atomic_store_max_f32(slot: &AtomicU32, val: f32) { + let mut cur = slot.load(Ordering::Relaxed); + while val > f32::from_bits(cur) { + match slot.compare_exchange_weak(cur, val.to_bits(), Ordering::Relaxed, Ordering::Relaxed) { + Ok(_) => break, + Err(actual) => cur = actual, + } + } } // we were using row id as doc id in the past, which is u64, @@ -622,6 +639,38 @@ impl<'a, S: Scorer> Wand<'a, S> { and_last_doc: None, docs, scorer, + shared_threshold: None, + } + } + + /// Share one cross-partition top-k floor across a query's partitions. + pub(crate) fn with_shared_threshold(mut self, shared: Arc) -> Self { + self.shared_threshold = Some(shared); + self + } + + /// Set the pruning threshold from this partition's k-th best, raised to the + /// shared cross-partition floor when one is attached. + fn update_threshold(&mut self, local_kth: f32, wand_factor: f32) { + let mut t = local_kth * wand_factor; + if let Some(shared) = self.shared_threshold.as_ref() { + atomic_store_max_f32(shared, local_kth); + let g = f32::from_bits(shared.load(Ordering::Relaxed)) * wand_factor; + if g > t { + t = g; + } + } + self.threshold = t; + } + + /// Raise the local threshold to the shared cross-partition floor, picking up + /// updates published by sibling partitions. 
+ fn raise_to_shared_floor(&mut self, wand_factor: f32) { + if let Some(shared) = self.shared_threshold.as_ref() { + let g = f32::from_bits(shared.load(Ordering::Relaxed)) * wand_factor; + if g > self.threshold { + self.threshold = g; + } } } @@ -651,7 +700,11 @@ impl<'a, S: Scorer> Wand<'a, S> { let mut candidates = BinaryHeap::with_capacity(std::cmp::min(limit, BLOCK_SIZE * 10)); let mut num_comparisons = 0; - while let Some((doc, mut score)) = self.next()? { + loop { + self.raise_to_shared_floor(params.wand_factor); + let Some((doc, mut score)) = self.next()? else { + break; + }; num_comparisons += 1; let row_id = match &doc { @@ -696,12 +749,14 @@ impl<'a, S: Scorer> Wand<'a, S> { if candidates.len() < limit { candidates.push(Reverse((ScoredDoc::new(row_id, score), freqs, doc_length))); if candidates.len() == limit { - self.threshold = candidates.peek().unwrap().0.0.score.0 * params.wand_factor; + let kth = candidates.peek().unwrap().0.0.score.0; + self.update_threshold(kth, params.wand_factor); } } else if score > candidates.peek().unwrap().0.0.score.0 { candidates.pop(); candidates.push(Reverse((ScoredDoc::new(row_id, score), freqs, doc_length))); - self.threshold = candidates.peek().unwrap().0.0.score.0 * params.wand_factor; + let kth = candidates.peek().unwrap().0.0.score.0; + self.update_threshold(kth, params.wand_factor); } if self.operator == Operator::Or { self.push_back_leads(doc.doc_id() + 1); @@ -802,12 +857,14 @@ impl<'a, S: Scorer> Wand<'a, S> { if candidates.len() < limit { candidates.push(Reverse((ScoredDoc::new(row_id, score), freqs, doc_length))); if candidates.len() == limit { - self.threshold = candidates.peek().unwrap().0.0.score.0 * params.wand_factor; + let kth = candidates.peek().unwrap().0.0.score.0; + self.update_threshold(kth, params.wand_factor); } } else if score > candidates.peek().unwrap().0.0.score.0 { candidates.pop(); candidates.push(Reverse((ScoredDoc::new(row_id, score), freqs, doc_length))); - self.threshold = 
candidates.peek().unwrap().0.0.score.0 * params.wand_factor; + let kth = candidates.peek().unwrap().0.0.score.0; + self.update_threshold(kth, params.wand_factor); } self.advance_lead_to_head(doc_id + 1); @@ -1706,6 +1763,70 @@ mod tests { assert_eq!(result.len(), 0); // Should not panic } + /// The shared floor prunes partitions that can't reach the global top-k: a + /// high-scoring partition sets the floor and the rest skip their blocks. + /// (Result correctness is covered by the FTS search tests, since sharing is + /// always on.) + #[test] + fn cross_partition_threshold_sharing_prunes() { + use crate::metrics::MetricsCollector; + use std::sync::atomic::AtomicUsize; + + #[derive(Default)] + struct CountComparisons(AtomicUsize); + impl MetricsCollector for CountComparisons { + fn record_parts_loaded(&self, _: usize) {} + fn record_index_loads(&self, _: usize) {} + fn record_comparisons(&self, n: usize) { + self.0.fetch_add(n, Ordering::Relaxed); + } + } + + let params = FtsSearchParams::default().with_limit(Some(10)); + let part_docs = 4 * BLOCK_SIZE as u32; + // One high-scoring partition (weight 10) then 7 low-scoring ones. + let parts: Vec<(f32, std::ops::Range)> = std::iter::once((10.0, 0..part_docs)) + .chain((1..8).map(|i| (1.0, i * part_docs..(i + 1) * part_docs))) + .collect(); + + let new_floor = || Arc::new(AtomicU32::new(f32::NEG_INFINITY.to_bits())); + + // Total comparisons across all partitions. `Some(floor)` makes every + // partition share that one floor; `None` gives each its own. 
+ let total_comparisons = |shared_floor: Option<&Arc<AtomicU32>>| -> usize { + let metrics = CountComparisons::default(); + for (qw, rows) in &parts { + let mut docs = DocSet::default(); + for d in rows.clone() { + docs.append(d as u64, 1); + } + let postings = vec![PostingIterator::with_query_weight( + String::from("t"), + 0, + 0, + *qw, + generate_posting_list(rows.clone().collect(), *qw, None, false), + docs.len(), + )]; + let floor = shared_floor.cloned().unwrap_or_else(new_floor); + Wand::new(Operator::Or, postings.into_iter(), &docs, UnitScorer) + .with_shared_threshold(floor) + .search(&params, Arc::new(RowAddrMask::default()), &metrics) + .unwrap(); + } + metrics.0.load(Ordering::Relaxed) + }; + + let one_floor = new_floor(); + let with_shared_floor = total_comparisons(Some(&one_floor)); + let with_private_floors = total_comparisons(None); + assert!( + with_shared_floor < with_private_floors, + "shared floor should prune comparisons: \ + shared={with_shared_floor} private={with_private_floors}" + ); + } + #[test] fn test_posting_iterator_next_compressed_partition_point() { let mut docs = DocSet::default(); From f154b4e84942e5b8028f7f4a7a77b1d963107202 Mon Sep 17 00:00:00 2001 From: Lance Release Bot Date: Thu, 4 Jun 2026 00:58:30 +0000 Subject: [PATCH 012/177] chore: release beta version 8.0.0-beta.2 --- .bumpversion.toml | 2 +- Cargo.lock | 76 +++++++++++++++++++-------------------- Cargo.toml | 42 +++++++++++----------- java/lance-jni/Cargo.lock | 42 +++++++++++----------- java/lance-jni/Cargo.toml | 2 +- java/pom.xml | 2 +- python/Cargo.lock | 42 +++++++++++----------- python/Cargo.toml | 2 +- 8 files changed, 105 insertions(+), 105 deletions(-) diff --git a/.bumpversion.toml b/.bumpversion.toml index ee405e7d689..5e79b8cad2c 100644 --- a/.bumpversion.toml +++ b/.bumpversion.toml @@ -1,5 +1,5 @@ [tool.bumpversion] -current_version = "8.0.0-beta.1" +current_version = "8.0.0-beta.2" parse = "(?P<major>\\d+)\\.(?P<minor>\\d+)\\.(?P<patch>\\d+)(-(?P<prerelease>(beta|rc))\\.(?P<prerelease_num>\\d+))?"
serialize = [ "{major}.{minor}.{patch}-{prerelease}.{prerelease_num}", diff --git a/Cargo.lock b/Cargo.lock index dc8146a8e01..37f106596bc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -147,7 +147,7 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -158,7 +158,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -1491,7 +1491,7 @@ version = "3.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "faf9468729b8cbcea668e36183cb69d317348c2e08e994829fb56ebfdfbaac34" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -2853,7 +2853,7 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -3005,7 +3005,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -3166,7 +3166,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4195,7 +4195,7 @@ checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" dependencies = [ "hermit-abi", "libc", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -4431,7 +4431,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "all_asserts", "approx", @@ -4534,7 +4534,7 @@ dependencies = [ [[package]] 
name = "lance-arrow" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", @@ -4582,7 +4582,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrayref", "paste", @@ -4591,7 +4591,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", @@ -4628,7 +4628,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -4661,7 +4661,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -4681,7 +4681,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow-arith", "arrow-array", @@ -4726,7 +4726,7 @@ dependencies = [ [[package]] name = "lance-examples" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "all_asserts", "arrow", @@ -4752,7 +4752,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow-arith", "arrow-array", @@ -4792,7 +4792,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "datafusion", "geo-traits", @@ -4806,7 +4806,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "approx", "arc-swap", @@ -4884,7 +4884,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow", "arrow-arith", @@ -4933,7 +4933,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "approx", "arrow-array", @@ -4953,7 +4953,7 @@ 
dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow", "async-trait", @@ -4965,7 +4965,7 @@ dependencies = [ [[package]] name = "lance-namespace-datafusion" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow-array", "arrow-schema", @@ -4981,7 +4981,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow", "arrow-ipc", @@ -5037,7 +5037,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", @@ -5056,7 +5056,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -5103,7 +5103,7 @@ dependencies = [ [[package]] name = "lance-test-macros" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "proc-macro2", "quote", @@ -5112,7 +5112,7 @@ dependencies = [ [[package]] name = "lance-testing" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow-array", "arrow-schema", @@ -5125,7 +5125,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "icu_segmenter", "jieba-rs", @@ -5137,7 +5137,7 @@ dependencies = [ [[package]] name = "lance-tools" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "clap", "lance-core", @@ -5715,7 +5715,7 @@ version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -6972,7 +6972,7 @@ dependencies = [ "once_cell", "socket2", "tracing", - "windows-sys 0.52.0", + "windows-sys 0.60.2", ] [[package]] @@ -7693,7 +7693,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", 
- "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -7752,7 +7752,7 @@ dependencies = [ "security-framework", "security-framework-sys", "webpki-root-certs", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -8297,7 +8297,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "52d1cfed4120b4d927bf7c0f86d2087a4a7d6027c906d9f9d525a80573b9be51" dependencies = [ "libc", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -8635,7 +8635,7 @@ dependencies = [ "getrandom 0.4.2", "once_cell", "rustix", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -9666,7 +9666,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -10294,9 +10294,9 @@ checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" [[package]] name = "yoke" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca" +checksum = "709fe23a0424b6a435d82152b1bd3fdfb0833487d5fa90d05d42762a9891fef5" dependencies = [ "stable_deref_trait", "yoke-derive", diff --git a/Cargo.toml b/Cargo.toml index 5044498dd41..f144c3f2d19 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,7 +31,7 @@ resolver = "3" [workspace.package] -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" @@ -56,26 +56,26 @@ rust-version = "1.91.0" [workspace.dependencies] arc-swap = "1.7" libc = "0.2.176" -lance = { version = "=8.0.0-beta.1", path = "./rust/lance", default-features = false } -lance-arrow = { version = "=8.0.0-beta.1", path = "./rust/lance-arrow" } -lance-core = { version = "=8.0.0-beta.1", path = "./rust/lance-core" } -lance-datafusion = { version = "=8.0.0-beta.1", 
path = "./rust/lance-datafusion" } -lance-datagen = { version = "=8.0.0-beta.1", path = "./rust/lance-datagen" } -lance-encoding = { version = "=8.0.0-beta.1", path = "./rust/lance-encoding" } -lance-file = { version = "=8.0.0-beta.1", path = "./rust/lance-file" } -lance-geo = { version = "=8.0.0-beta.1", path = "./rust/lance-geo" } -lance-index = { version = "=8.0.0-beta.1", path = "./rust/lance-index" } -lance-io = { version = "=8.0.0-beta.1", path = "./rust/lance-io", default-features = false } -lance-linalg = { version = "=8.0.0-beta.1", path = "./rust/lance-linalg" } -lance-namespace = { version = "=8.0.0-beta.1", path = "./rust/lance-namespace" } -lance-namespace-impls = { version = "=8.0.0-beta.1", path = "./rust/lance-namespace-impls" } +lance = { version = "=8.0.0-beta.2", path = "./rust/lance", default-features = false } +lance-arrow = { version = "=8.0.0-beta.2", path = "./rust/lance-arrow" } +lance-core = { version = "=8.0.0-beta.2", path = "./rust/lance-core" } +lance-datafusion = { version = "=8.0.0-beta.2", path = "./rust/lance-datafusion" } +lance-datagen = { version = "=8.0.0-beta.2", path = "./rust/lance-datagen" } +lance-encoding = { version = "=8.0.0-beta.2", path = "./rust/lance-encoding" } +lance-file = { version = "=8.0.0-beta.2", path = "./rust/lance-file" } +lance-geo = { version = "=8.0.0-beta.2", path = "./rust/lance-geo" } +lance-index = { version = "=8.0.0-beta.2", path = "./rust/lance-index" } +lance-io = { version = "=8.0.0-beta.2", path = "./rust/lance-io", default-features = false } +lance-linalg = { version = "=8.0.0-beta.2", path = "./rust/lance-linalg" } +lance-namespace = { version = "=8.0.0-beta.2", path = "./rust/lance-namespace" } +lance-namespace-impls = { version = "=8.0.0-beta.2", path = "./rust/lance-namespace-impls" } lance-namespace-datafusion = { version = "=7.0.0-beta.9", path = "./rust/lance-namespace-datafusion" } lance-namespace-reqwest-client = "0.8.0" -lance-select = { version = "=8.0.0-beta.1", path = 
"./rust/lance-select" } -lance-tokenizer = { version = "=8.0.0-beta.1", path = "./rust/lance-tokenizer" } -lance-table = { version = "=8.0.0-beta.1", path = "./rust/lance-table" } -lance-test-macros = { version = "=8.0.0-beta.1", path = "./rust/lance-test-macros" } -lance-testing = { version = "=8.0.0-beta.1", path = "./rust/lance-testing" } +lance-select = { version = "=8.0.0-beta.2", path = "./rust/lance-select" } +lance-tokenizer = { version = "=8.0.0-beta.2", path = "./rust/lance-tokenizer" } +lance-table = { version = "=8.0.0-beta.2", path = "./rust/lance-table" } +lance-test-macros = { version = "=8.0.0-beta.2", path = "./rust/lance-test-macros" } +lance-testing = { version = "=8.0.0-beta.2", path = "./rust/lance-testing" } approx = "0.5.1" # Note that this one does not include pyarrow arrow = { version = "58.0.0", optional = false, features = ["prettyprint"] } @@ -102,7 +102,7 @@ half = { "version" = "2.1", default-features = false, features = [ "num-traits", "std", ] } -lance-bitpacking = { version = "=8.0.0-beta.1", path = "./rust/compression/bitpacking" } +lance-bitpacking = { version = "=8.0.0-beta.2", path = "./rust/compression/bitpacking" } bitpacking = "0.9" bitvec = "1" bytes = "1.11.1" @@ -142,7 +142,7 @@ deepsize = "0.2.0" dirs = "6.0.0" either = "1.0" fst = { version = "0.4.7", features = ["levenshtein"] } -fsst = { version = "=8.0.0-beta.1", path = "./rust/compression/fsst" } +fsst = { version = "=8.0.0-beta.2", path = "./rust/compression/fsst" } futures = "0.3" geoarrow-array = "0.8" geoarrow-schema = "0.8" diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index 79d554807bd..9ae7f003df8 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -2569,7 +2569,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow-array", "rand 0.9.4", @@ -3721,7 +3721,7 @@ checksum = 
"e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arc-swap", "arrow", @@ -3795,7 +3795,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", @@ -3837,7 +3837,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrayref", "paste", @@ -3846,7 +3846,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", @@ -3881,7 +3881,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -3913,7 +3913,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -3931,7 +3931,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow-arith", "arrow-array", @@ -3966,7 +3966,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow-arith", "arrow-array", @@ -3997,7 +3997,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "datafusion", "geo-traits", @@ -4011,7 +4011,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arc-swap", "arrow", @@ -4080,7 +4080,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow", "arrow-arith", @@ -4122,7 +4122,7 @@ dependencies = [ [[package]] name = "lance-jni" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" 
dependencies = [ "arrow", "arrow-array", @@ -4158,7 +4158,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", @@ -4174,7 +4174,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow", "async-trait", @@ -4186,7 +4186,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow", "arrow-ipc", @@ -4230,7 +4230,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", @@ -4246,7 +4246,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -4284,7 +4284,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "icu_segmenter", "rust-stemmers", @@ -8477,9 +8477,9 @@ checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" [[package]] name = "yoke" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca" +checksum = "709fe23a0424b6a435d82152b1bd3fdfb0833487d5fa90d05d42762a9891fef5" dependencies = [ "stable_deref_trait", "yoke-derive", diff --git a/java/lance-jni/Cargo.toml b/java/lance-jni/Cargo.toml index e1478b27aeb..8649360a870 100644 --- a/java/lance-jni/Cargo.toml +++ b/java/lance-jni/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lance-jni" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" edition = "2024" authors = ["Lance Devs "] rust-version = "1.91" diff --git a/java/pom.xml b/java/pom.xml index 4639cd74d2b..579f8e52430 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -7,7 +7,7 @@ org.lance lance-core 
Lance Core - 8.0.0-beta.1 + 8.0.0-beta.2 jar Lance Format Java API diff --git a/python/Cargo.lock b/python/Cargo.lock index 162a221907f..d1484c47f52 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -2919,7 +2919,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4087,7 +4087,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arc-swap", "arrow", @@ -4162,7 +4162,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", @@ -4204,7 +4204,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrayref", "paste", @@ -4213,7 +4213,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", @@ -4248,7 +4248,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -4280,7 +4280,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -4298,7 +4298,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow-arith", "arrow-array", @@ -4333,7 +4333,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow-arith", "arrow-array", @@ -4364,7 +4364,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "datafusion", 
"geo-traits", @@ -4378,7 +4378,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arc-swap", "arrow", @@ -4448,7 +4448,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow", "arrow-arith", @@ -4490,7 +4490,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", @@ -4506,7 +4506,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow", "async-trait", @@ -4518,7 +4518,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow", "arrow-ipc", @@ -4562,7 +4562,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", @@ -4578,7 +4578,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -4618,7 +4618,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "icu_segmenter", "jieba-rs", @@ -6064,7 +6064,7 @@ dependencies = [ [[package]] name = "pylance" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -9231,9 +9231,9 @@ checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" [[package]] name = "yoke" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca" +checksum = "709fe23a0424b6a435d82152b1bd3fdfb0833487d5fa90d05d42762a9891fef5" dependencies = [ "stable_deref_trait", "yoke-derive", diff --git a/python/Cargo.toml 
b/python/Cargo.toml index 2ea0d46764b..68dcc5f65e9 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pylance" -version = "8.0.0-beta.1" +version = "8.0.0-beta.2" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" From 38d289d7e697d45d545d377899ae265353dc6819 Mon Sep 17 00:00:00 2001 From: George Stamatakis <126914070+gstamatakis95@users.noreply.github.com> Date: Thu, 4 Jun 2026 07:58:34 +0200 Subject: [PATCH 013/177] feat(python): add shared RaBitQ rotation for distributed IVF_RQ builds (#7014) Closes: #7012 ## What Distributed `IVF_RQ` builds work in the Rust engine (#6359) but could not be driven from Python because the RaBitQ rotation could not be pinned across workers. Each per-fragment build generated its own random rotation, so segments rotated vectors differently, their binary codes were not comparable, and merging corrupted the index. This adds a way to mint one rotation, broadcast it, and reuse it in every per-fragment build, mirroring how `pq_codebook` is injected. ## Changes - Add `build_rq_model(dimension, num_bits=1, dtype="float32")` that returns one rotation as a JSON string. - Add a `rabitq_model` parameter to `create_index_uncommitted`, parsed into a new transient `RQBuildParams.rotation` field and consumed by `RabitQuantizer::build`. - `build()` reuses the supplied rotation instead of generating a random one, after validating `num_bits`, `code_dim`, and the signs length. ## Notes - Only the fast rotation is supported because its sign vector is JSON serializable. - The matrix rotation keeps a dense matrix in a binary buffer that the JSON wire format drops, so it is rejected with a clear error. - The params proto, the segment builder, and the merge and commit paths are unchanged. ## Tests - Rust unit tests for shared-rotation reuse, identical codes across builds, mismatch and bad-input rejection, and the matrix-via-JSON rejection. 
- A Python integration test that builds two `IVF_RQ` segments on separate fragments with one shared rotation, merges, commits, and queries. --- python/python/lance/dataset.py | 12 ++ .../python/lance/lance/indices/__init__.pyi | 5 + python/python/tests/test_vector_index.py | 45 +++++ python/src/dataset.rs | 8 + python/src/indices.rs | 56 ++++++ rust/lance-index/src/vector/bq.rs | 12 +- rust/lance-index/src/vector/bq/builder.rs | 160 +++++++++++++++++- rust/lance-index/src/vector/bq/rotation.rs | 6 +- rust/lance/src/index/vector.rs | 1 + 9 files changed, 302 insertions(+), 3 deletions(-) diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 59c995de87b..931a09d6ce6 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -3327,6 +3327,7 @@ def _create_index_impl( streaming_coreset_rate: Optional[int] = None, streaming_refine_passes: Optional[int] = None, skip_transpose: bool = False, + rabitq_model: Optional[str] = None, require_commit: bool = True, **kwargs, ) -> Index: @@ -3648,6 +3649,9 @@ def _create_index_impl( if skip_transpose: kwargs["skip_transpose"] = True + if rabitq_model is not None: + kwargs["rabitq_model"] = rabitq_model + # Add fragment_ids and index_uuid to kwargs if provided for # distributed indexing if fragment_ids is not None: @@ -3970,6 +3974,7 @@ def create_index_uncommitted( streaming_coreset_rate: Optional[int] = None, streaming_refine_passes: Optional[int] = None, skip_transpose: bool = False, + rabitq_model: Optional[str] = None, **kwargs, ) -> Index: """ @@ -4000,6 +4005,12 @@ def create_index_uncommitted( requirement: - ``fragment_ids`` must be provided + - ``rabitq_model`` (``IVF_RQ`` only): a JSON string produced by + ``lance.lance.indices.build_rq_model``. It must be identical across all + workers for their segments to be mergeable, since it pins the RaBitQ + rotation so every segment rotates vectors the same way. 
If omitted, each + call generates its own random rotation, which is only safe for a single, + non-merged segment. Returns ------- @@ -4063,6 +4074,7 @@ def create_index_uncommitted( streaming_coreset_rate=streaming_coreset_rate, streaming_refine_passes=streaming_refine_passes, skip_transpose=skip_transpose, + rabitq_model=rabitq_model, require_commit=False, **kwargs, ) diff --git a/python/python/lance/lance/indices/__init__.pyi b/python/python/lance/lance/indices/__init__.pyi index 384e2528e99..d28bdd123fc 100644 --- a/python/python/lance/lance/indices/__init__.pyi +++ b/python/python/lance/lance/indices/__init__.pyi @@ -59,6 +59,11 @@ def transform_vectors( pq_codebook: pa.Array, dst_uri: str, ): ... +def build_rq_model( + dimension: int, + num_bits: int = 1, + dtype: str = "float32", +) -> str: ... class IndexSegmentDescription: uuid: str diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 9df41dd8300..d8f13a01da6 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -3000,6 +3000,51 @@ def test_commit_existing_index_segments_accepts_index_metadata(tmp_path): assert 0 < len(results) <= 5 +def test_distributed_ivf_rq_shared_rotation(tmp_path): + """Two IVF_RQ segments built on separate fragments with one shared RaBitQ rotation + merge into a single committed, queryable index. 
The shared ``rabitq_model`` (from + ``lance.lance.indices.build_rq_model``) is what makes the independently built + segments mergeable.""" + from lance.lance import indices + + dim = 32 + ds = _make_sample_dataset_base( + tmp_path, "dist_rq_merge", n_rows=512, dim=dim, max_rows_per_file=256 + ) + frags = ds.get_fragments() + assert len(frags) == 2 + + ivf_model = IndicesBuilder(ds, "vector").train_ivf( + num_partitions=2, + distance_type="l2", + sample_rate=8, + ) + rabitq_model = indices.build_rq_model(dimension=dim, num_bits=1) + base_kwargs = { + "column": "vector", + "index_type": "IVF_RQ", + "num_partitions": 2, + "num_bits": 1, + "ivf_centroids": ivf_model.centroids, + "rabitq_model": rabitq_model, + } + first = ds.create_index_uncommitted( + **base_kwargs, + fragment_ids=[frags[0].fragment_id], + ) + second = ds.create_index_uncommitted( + **base_kwargs, + fragment_ids=[frags[1].fragment_id], + ) + + merged = ds.merge_existing_index_segments([first, second]) + ds = ds.commit_existing_index_segments("vector_idx", "vector", [merged]) + + q = np.random.rand(dim).astype(np.float32) + results = ds.to_table(nearest={"column": "vector", "q": q, "k": 5}) + assert 0 < len(results) <= 5 + + def test_commit_existing_index_segments_accepts_uncommitted_vector_segments(tmp_path): ds = _make_sample_dataset_base(tmp_path, "segment_commit_ds", 2000, 128) frags = ds.get_fragments() diff --git a/python/src/dataset.rs b/python/src/dataset.rs index f70a9c7b1fb..1c7db74ddb4 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -20,6 +20,7 @@ use blob::LanceBlobFile; use chrono::{Duration, TimeDelta, Utc}; use futures::{StreamExt, TryFutureExt}; use lance_index::vector::bq::RQBuildParams; +use lance_index::vector::bq::storage::RabitQuantizationMetadata; use log::error; use object_store::path::Path; use pyo3::exceptions::{PyStopIteration, PyTypeError}; @@ -4361,6 +4362,13 @@ fn prepare_vector_index_params( pq_params.codebook = Some(codebook.values().clone()) }; + if let 
Some(r) = kwargs.get_item("rabitq_model")? { + let json: String = r.extract()?; + let meta: RabitQuantizationMetadata = serde_json::from_str(&json) + .map_err(|e| PyValueError::new_err(format!("Invalid rabitq_model JSON: {e}")))?; + rq_params.rotation = Some(meta); + }; + if let Some(version) = kwargs.get_item("index_file_version")? { let version: String = version.extract()?; index_file_version = IndexFileVersion::try_from(&version) diff --git a/python/src/indices.rs b/python/src/indices.rs index fe988206117..6efb8538d08 100644 --- a/python/src/indices.rs +++ b/python/src/indices.rs @@ -302,6 +302,61 @@ fn train_pq_model<'py>( codebook.to_pyarrow(py) } +/// Mint one RaBitQ rotation and return it as a JSON string. +/// +/// Distributed IVF_RQ builds must pin a single rotation across all workers so that +/// independently built per-fragment segments rotate vectors identically and their +/// binary codes remain comparable when merged. A driver calls this once and broadcasts +/// the resulting string to every `create_index_uncommitted(..., rabitq_model=...)` call. +/// +/// The rotation is always the "fast" rotation since its sign vector is JSON-serializable, +/// whereas the "matrix" rotation stores a dense matrix in a binary buffer that is dropped by +/// the JSON wire format. `dtype` is accepted for API symmetry but does not affect the fast +/// rotation. +/// +/// # Example (Python) +/// +/// ```python +/// from lance.lance import indices +/// +/// # Mint one model and broadcast `model` to every worker. 
+/// model = indices.build_rq_model(dimension=128, num_bits=1) +/// seg = ds.create_index_uncommitted( +/// column="vector", +/// index_type="IVF_RQ", +/// num_partitions=256, +/// ivf_centroids=centroids, +/// rabitq_model=model, +/// fragment_ids=my_fragments, +/// ) +/// ``` +#[pyfunction] +#[pyo3(signature = (dimension, num_bits=1, dtype="float32"))] +pub fn build_rq_model(dimension: usize, num_bits: u8, dtype: &str) -> PyResult { + use arrow::datatypes::{Float16Type, Float32Type, Float64Type}; + use lance_index::vector::bq::RQRotationType; + use lance_index::vector::bq::builder::RabitQuantizer; + use lance_index::vector::quantizer::Quantization; + + if !dimension.is_multiple_of(u8::BITS as usize) { + return Err(PyValueError::new_err( + "dimension must be divisible by 8 for IVF_RQ", + )); + } + let dim = dimension as i32; + let rotation = RQRotationType::Fast; + let quantizer = match dtype.to_lowercase().as_str() { + "float16" => RabitQuantizer::new_with_rotation::(num_bits, dim, rotation), + "float32" => RabitQuantizer::new_with_rotation::(num_bits, dim, rotation), + "float64" => RabitQuantizer::new_with_rotation::(num_bits, dim, rotation), + other => { + return Err(PyValueError::new_err(format!("unsupported dtype: {other}"))); + } + }; + serde_json::to_string(&quantizer.metadata(None)) + .map_err(|e| PyValueError::new_err(format!("failed to serialize RQ model: {e}"))) +} + #[allow(clippy::too_many_arguments)] async fn do_transform_vectors( dataset: &Dataset, @@ -696,6 +751,7 @@ pub fn register_indices(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { let indices = PyModule::new(py, "indices")?; indices.add_wrapped(wrap_pyfunction!(train_ivf_model))?; indices.add_wrapped(wrap_pyfunction!(train_pq_model))?; + indices.add_wrapped(wrap_pyfunction!(build_rq_model))?; indices.add_wrapped(wrap_pyfunction!(transform_vectors))?; indices.add_wrapped(wrap_pyfunction!(shuffle_transformed_vectors))?; indices.add_wrapped(wrap_pyfunction!(load_shuffled_vectors))?; diff 
--git a/rust/lance-index/src/vector/bq.rs b/rust/lance-index/src/vector/bq.rs index 1df04d4b134..51439e2c905 100644 --- a/rust/lance-index/src/vector/bq.rs +++ b/rust/lance-index/src/vector/bq.rs @@ -14,6 +14,7 @@ use lance_core::{Error, Result}; use num_traits::Float; use serde::{Deserialize, Serialize}; +use crate::vector::bq::storage::RabitQuantizationMetadata; use crate::vector::quantizer::QuantizerBuildParams; pub mod builder; @@ -104,10 +105,16 @@ impl FromStr for RQRotationType { } } -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(Clone, Debug)] pub struct RQBuildParams { pub num_bits: u8, pub rotation_type: RQRotationType, + /// Optional pre-built rotation to reuse instead of generating a fresh random one. + /// + /// Distributed `IVF_RQ` builds mint one rotation and broadcast it so every segment + /// rotates vectors identically. This is transient build-time state and is never + /// persisted to the `RabitQuantization` params proto. + pub rotation: Option, } pub fn validate_rq_num_bits(num_bits: u8) -> Result<()> { @@ -155,6 +162,7 @@ impl RQBuildParams { Self { num_bits, rotation_type: RQRotationType::default(), + rotation: None, } } @@ -162,6 +170,7 @@ impl RQBuildParams { Self { num_bits, rotation_type, + rotation: None, } } } @@ -190,6 +199,7 @@ impl Default for RQBuildParams { Self { num_bits: 1, rotation_type: RQRotationType::default(), + rotation: None, } } } diff --git a/rust/lance-index/src/vector/bq/builder.rs b/rust/lance-index/src/vector/bq/builder.rs index f98c370aefc..70e084472d7 100644 --- a/rust/lance-index/src/vector/bq/builder.rs +++ b/rust/lance-index/src/vector/bq/builder.rs @@ -23,7 +23,7 @@ use crate::vector::bq::storage::{ use crate::vector::bq::transform::{ADD_FACTORS_FIELD, SCALE_FACTORS_FIELD}; use crate::vector::bq::{ RQBuildParams, RQRotationType, rabit_binary_code_bytes, - rotation::{apply_fast_rotation, random_fast_rotation_signs}, + rotation::{apply_fast_rotation, fast_rotation_signs_len, random_fast_rotation_signs}, 
validate_supported_rq_num_bits, }; use crate::vector::quantizer::{Quantization, Quantizer, QuantizerBuildParams}; @@ -329,6 +329,46 @@ impl Quantization for RabitQuantizer { )); } + // Reuse a supplied rotation instead of generating a fresh random one. + if let Some(meta) = &params.rotation { + let expected_code_dim = dim * params.num_bits as usize; + if meta.num_bits != params.num_bits || meta.code_dim as usize != expected_code_dim { + return Err(Error::invalid_input(format!( + "supplied RaBitQ rotation does not match build params: rotation \ + num_bits={}, code_dim={}; expected num_bits={}, code_dim={}", + meta.num_bits, meta.code_dim, params.num_bits, expected_code_dim + ))); + } + + match meta.rotation_type { + RQRotationType::Fast => { + let signs = meta.fast_rotation_signs.as_ref().ok_or_else(|| { + Error::invalid_input("supplied fast RaBitQ rotation is missing signs") + })?; + let expected_len = fast_rotation_signs_len(meta.code_dim as usize); + if signs.len() != expected_len { + return Err(Error::invalid_input(format!( + "supplied fast RaBitQ rotation signs length {} does not match \ + expected {} for code_dim={}", + signs.len(), + expected_len, + meta.code_dim + ))); + } + } + RQRotationType::Matrix => { + if meta.rotate_mat.is_none() { + return Err(Error::invalid_input( + "use the fast rotation for distributed builds", + )); + } + } + } + return Ok(Self { + metadata: meta.clone(), + }); + } + let q = match data.as_fixed_size_list().value_type() { DataType::Float16 => Self::new_with_rotation::( params.num_bits, @@ -594,6 +634,124 @@ mod tests { ); } + fn sample_fsl(n: usize, dim: usize) -> FixedSizeListArray { + let values: Vec = (0..n * dim).map(|i| ((i * 31 % 17) as f32) - 8.0).collect(); + FixedSizeListArray::try_new_from_values(Float32Array::from(values), dim as i32).unwrap() + } + + fn quantized_codes(q: &RabitQuantizer, data: &FixedSizeListArray) -> Vec { + use arrow::datatypes::UInt8Type; + q.quantize(data) + .unwrap() + .as_fixed_size_list() + 
.values() + .as_primitive::() + .values() + .to_vec() + } + + #[test] + fn test_shared_fast_rotation_gives_identical_codes() { + let dim = 32; + let seed = RabitQuantizer::new_with_rotation::(1, dim, RQRotationType::Fast); + let json = serde_json::to_string(&seed.metadata(None)).unwrap(); + let meta: RabitQuantizationMetadata = serde_json::from_str(&json).unwrap(); + + let params = RQBuildParams { + num_bits: 1, + rotation_type: RQRotationType::Fast, + rotation: Some(meta), + }; + let data = sample_fsl(8, dim as usize); + let q_a = RabitQuantizer::build(&data, DistanceType::L2, &params).unwrap(); + let q_b = RabitQuantizer::build(&data, DistanceType::L2, &params).unwrap(); + + assert_eq!( + quantized_codes(&q_a, &data), + quantized_codes(&q_b, &data), + "shared rotation must yield identical codes" + ); + } + + #[test] + fn test_unpinned_rotation_gives_different_codes() { + let dim = 32; + let params = RQBuildParams::new(1); + let data = sample_fsl(8, dim as usize); + let q_a = RabitQuantizer::build(&data, DistanceType::L2, &params).unwrap(); + let q_b = RabitQuantizer::build(&data, DistanceType::L2, &params).unwrap(); + + assert_ne!( + quantized_codes(&q_a, &data), + quantized_codes(&q_b, &data), + "independent unpinned rotations must yield different codes" + ); + } + + #[test] + fn test_build_rejects_rotation_with_mismatched_code_dim() { + let seed = RabitQuantizer::new_with_rotation::(1, 16, RQRotationType::Fast); + let params = RQBuildParams { + num_bits: 1, + rotation_type: RQRotationType::Fast, + rotation: Some(seed.metadata(None)), + }; + let data = sample_fsl(4, 32); + let err = RabitQuantizer::build(&data, DistanceType::L2, &params).unwrap_err(); + assert!( + err.to_string().contains("does not match build params"), + "{}", + err + ); + } + + #[test] + fn test_build_rejects_fast_rotation_with_bad_signs_length() { + let dim = 16; + let seed = RabitQuantizer::new_with_rotation::(1, dim, RQRotationType::Fast); + let mut meta = seed.metadata(None); + // Corrupt the signs to the 
wrong length (valid would be 4 * ceil(16/8) = 8). + meta.fast_rotation_signs = Some(vec![0u8; 7]); + let params = RQBuildParams { + num_bits: 1, + rotation_type: RQRotationType::Fast, + rotation: Some(meta), + }; + let data = sample_fsl(4, dim as usize); + let err = RabitQuantizer::build(&data, DistanceType::L2, &params).unwrap_err(); + assert!(err.to_string().contains("signs length"), "{}", err); + } + + #[test] + fn test_matrix_rotation_lost_through_json_is_rejected() { + let dim = 16; + let seed = RabitQuantizer::new_with_rotation::(1, dim, RQRotationType::Matrix); + let meta = seed.metadata(None); + assert!(meta.rotate_mat.is_some()); + + let json = serde_json::to_string(&meta).unwrap(); + let parsed: RabitQuantizationMetadata = serde_json::from_str(&json).unwrap(); + assert_eq!(parsed.rotation_type, RQRotationType::Matrix); + assert!( + parsed.rotate_mat.is_none(), + "matrix is expected to be dropped by JSON serialization" + ); + + let params = RQBuildParams { + num_bits: 1, + rotation_type: RQRotationType::Matrix, + rotation: Some(parsed), + }; + let data = sample_fsl(4, dim as usize); + let err = RabitQuantizer::build(&data, DistanceType::L2, &params).unwrap_err(); + assert!( + err.to_string() + .contains("fast rotation for distributed builds"), + "{}", + err + ); + } + #[test] fn test_rabit_quantizer_rejects_unsupported_num_bits() { let vectors = Float32Array::from(vec![0.0f32; 4 * 32]); diff --git a/rust/lance-index/src/vector/bq/rotation.rs b/rust/lance-index/src/vector/bq/rotation.rs index 4f4895ac198..de8c8acccb3 100644 --- a/rust/lance-index/src/vector/bq/rotation.rs +++ b/rust/lance-index/src/vector/bq/rotation.rs @@ -138,9 +138,13 @@ fn sign_bytes_per_round(dim: usize) -> usize { dim.div_ceil(8) } +pub(crate) fn fast_rotation_signs_len(dim: usize) -> usize { + FAST_ROTATION_ROUNDS * sign_bytes_per_round(dim) +} + pub fn random_fast_rotation_signs(dim: usize) -> Vec { // Each round needs one random sign bit per dimension. 
- let mut signs = vec![0u8; FAST_ROTATION_ROUNDS * sign_bytes_per_round(dim)]; + let mut signs = vec![0u8; fast_rotation_signs_len(dim)]; rand::rng().fill_bytes(&mut signs); signs } diff --git a/rust/lance/src/index/vector.rs b/rust/lance/src/index/vector.rs index ff7d2383c67..04c9d31a0e8 100644 --- a/rust/lance/src/index/vector.rs +++ b/rust/lance/src/index/vector.rs @@ -1880,6 +1880,7 @@ fn derive_rabit_params(rabit_quantizer: &RabitQuantizer) -> RQBuildParams { RQBuildParams { num_bits: rabit_quantizer.num_bits(), rotation_type: rabit_quantizer.rotation_type(), + rotation: None, } } From 2a73364187bc9b1f027a49aa81ac43446c942ff8 Mon Sep 17 00:00:00 2001 From: Xin Sun Date: Thu, 4 Jun 2026 17:03:50 +0800 Subject: [PATCH 014/177] feat: add TOS object store support via OpenDAL (#7019) ## What changed Add Volcengine TOS (`tos://`) object store support through OpenDAL. - Register a TOS object store provider for `tos://bucket/path`. - Add the `tos` feature to `lance-io` and enable it by default through `rust/lance`. - Support `TOS_` / `VOLCENGINE_` environment variables and `tos_*` storage options. - Document TOS configuration. ## Testing Validated against a real Volcengine TOS object store. 
--- Cargo.lock | 31 ++ docs/src/guide/object_store.md | 30 ++ python/Cargo.lock | 31 ++ rust/lance-io/Cargo.toml | 2 + rust/lance-io/src/object_store.rs | 12 +- rust/lance-io/src/object_store/providers.rs | 5 + .../src/object_store/providers/tos.rs | 301 ++++++++++++++++++ rust/lance-io/tests/tos_integration.rs | 81 +++++ rust/lance/Cargo.toml | 3 +- 9 files changed, 493 insertions(+), 3 deletions(-) create mode 100644 rust/lance-io/src/object_store/providers/tos.rs create mode 100644 rust/lance-io/tests/tos_integration.rs diff --git a/Cargo.lock b/Cargo.lock index 37f106596bc..06aa1e2c5d9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5996,6 +5996,7 @@ dependencies = [ "opendal-service-hf", "opendal-service-oss", "opendal-service-s3", + "opendal-service-tos", ] [[package]] @@ -6213,6 +6214,23 @@ dependencies = [ "url", ] +[[package]] +name = "opendal-service-tos" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f2f7a4c32e5202eb4ac72e76c4b5e30c86ab60762811172f4111103b9d673a1" +dependencies = [ + "bytes", + "http 1.4.1", + "opendal-core", + "quick-xml 0.39.4", + "reqsign-core", + "reqsign-file-read-tokio", + "reqsign-volcengine-tos", + "serde", + "serde_json", +] + [[package]] name = "openssl" version = "0.10.80" @@ -7409,6 +7427,19 @@ dependencies = [ "serde_json", ] +[[package]] +name = "reqsign-volcengine-tos" +version = "3.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d757602a7ef2b6025c0da77e6d2e23fbdef35930fa466b15ffbf0a3f13acf7" +dependencies = [ + "anyhow", + "http 1.4.1", + "log", + "percent-encoding", + "reqsign-core", +] + [[package]] name = "reqwest" version = "0.12.28" diff --git a/docs/src/guide/object_store.md b/docs/src/guide/object_store.md index 1710e3b5100..182b93c0574 100644 --- a/docs/src/guide/object_store.md +++ b/docs/src/guide/object_store.md @@ -218,3 +218,33 @@ ds = lance.dataset( | `oss_secret_access_key` | Access key secret used for OSS 
authentication. Optional if credentials are provided by environment. | | `oss_region` | OSS region (for example, `cn-hangzhou`). Optional. | | `oss_security_token` | Security token for temporary credentials (STS). Optional. | + +## Volcengine TOS Configuration + +TOS credentials can be set in the environment variables `TOS_ACCESS_KEY_ID`, +`TOS_SECRET_ACCESS_KEY`, `TOS_ENDPOINT`, `TOS_REGION`, and `TOS_SECURITY_TOKEN`. +Lance also accepts the corresponding `VOLCENGINE_` environment variable prefix. +Alternatively, credentials can be passed as parameters to the `storage_options` +parameter; explicit `storage_options` override environment variables: + +```python +import lance +ds = lance.dataset( + "tos://bucket/path", + storage_options={ + "tos_endpoint": "https://tos-cn-beijing.volces.com", + "tos_region": "cn-beijing", + "tos_access_key_id": "my-access-key", + "tos_secret_access_key": "my-secret-key", + "tos_security_token": "my-session-token", + } +) +``` + +| Key | Description | +|-----|-------------| +| `tos_endpoint` | TOS endpoint. Required (for example, `https://tos-cn-beijing.volces.com`). | +| `tos_region` | TOS signing region (for example, `cn-beijing`). Optional. | +| `tos_access_key_id` | Access key ID used for TOS authentication. Optional if credentials are provided by environment. | +| `tos_secret_access_key` | Secret access key used for TOS authentication. Optional if credentials are provided by environment. | +| `tos_security_token` | Security token for temporary credentials. Optional. 
| diff --git a/python/Cargo.lock b/python/Cargo.lock index d1484c47f52..589346aff04 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -5334,6 +5334,7 @@ dependencies = [ "opendal-service-hf", "opendal-service-oss", "opendal-service-s3", + "opendal-service-tos", ] [[package]] @@ -5551,6 +5552,23 @@ dependencies = [ "url", ] +[[package]] +name = "opendal-service-tos" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f2f7a4c32e5202eb4ac72e76c4b5e30c86ab60762811172f4111103b9d673a1" +dependencies = [ + "bytes", + "http 1.4.1", + "opendal-core", + "quick-xml 0.39.4", + "reqsign-core", + "reqsign-file-read-tokio", + "reqsign-volcengine-tos", + "serde", + "serde_json", +] + [[package]] name = "openssl-probe" version = "0.2.1" @@ -6692,6 +6710,19 @@ dependencies = [ "serde_json", ] +[[package]] +name = "reqsign-volcengine-tos" +version = "3.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d757602a7ef2b6025c0da77e6d2e23fbdef35930fa466b15ffbf0a3f13acf7" +dependencies = [ + "anyhow", + "http 1.4.1", + "log", + "percent-encoding", + "reqsign-core", +] + [[package]] name = "reqwest" version = "0.12.28" diff --git a/rust/lance-io/Cargo.toml b/rust/lance-io/Cargo.toml index d0c8a3c3c33..c3d01941968 100644 --- a/rust/lance-io/Cargo.toml +++ b/rust/lance-io/Cargo.toml @@ -74,6 +74,8 @@ azure = ["object_store/azure", "dep:opendal", "opendal/services-azblob", "openda oss = ["dep:opendal", "opendal/services-oss", "dep:object_store_opendal"] tencent = ["dep:opendal", "opendal/services-cos", "dep:object_store_opendal"] huggingface = ["dep:opendal", "opendal/services-huggingface", "dep:object_store_opendal"] +tos = ["dep:opendal", "opendal/services-tos", "dep:object_store_opendal"] +tos-test = ["tos"] test-util = [] [lints] diff --git a/rust/lance-io/src/object_store.rs b/rust/lance-io/src/object_store.rs index 698378c5b9e..b1ba3d45d8a 100644 --- a/rust/lance-io/src/object_store.rs +++ 
b/rust/lance-io/src/object_store.rs @@ -36,7 +36,7 @@ use super::local::LocalObjectReader; use crate::uring::{UringCurrentThreadReader, UringReader}; #[cfg(any(feature = "aws", feature = "azure", feature = "gcp"))] pub(crate) mod dynamic_credentials; -#[cfg(any(feature = "oss", feature = "huggingface"))] +#[cfg(any(feature = "oss", feature = "huggingface", feature = "tos"))] pub(crate) mod dynamic_opendal; mod list_retry; pub mod providers; @@ -61,7 +61,15 @@ pub const DEFAULT_LOCAL_IO_PARALLELISM: usize = 8; pub const DEFAULT_CLOUD_IO_PARALLELISM: usize = 64; const DEFAULT_LOCAL_BLOCK_SIZE: usize = 4 * 1024; // 4KB block size -#[cfg(any(feature = "aws", feature = "gcp", feature = "azure"))] +#[cfg(any( + feature = "aws", + feature = "gcp", + feature = "azure", + feature = "oss", + feature = "tencent", + feature = "huggingface", + feature = "tos", +))] const DEFAULT_CLOUD_BLOCK_SIZE: usize = 64 * 1024; // 64KB block size pub static DEFAULT_MAX_IOP_SIZE: std::sync::LazyLock = std::sync::LazyLock::new(|| { diff --git a/rust/lance-io/src/object_store/providers.rs b/rust/lance-io/src/object_store/providers.rs index 20fa251a0c5..aafe665cfe1 100644 --- a/rust/lance-io/src/object_store/providers.rs +++ b/rust/lance-io/src/object_store/providers.rs @@ -33,6 +33,8 @@ pub mod oss; pub mod shared_memory; #[cfg(feature = "tencent")] pub mod tencent; +#[cfg(feature = "tos")] +pub mod tos; #[async_trait::async_trait] pub trait ObjectStoreProvider: std::fmt::Debug + Sync + Send { @@ -95,6 +97,7 @@ pub struct ObjectStoreRegistryStats { /// - `s3+ddb`: An S3 object store with DynamoDB for metadata. /// - `az`: An Azure Blob Storage object store. /// - `gs`: A Google Cloud Storage object store. +/// - `tos`: A Volcengine TOS object store. /// /// Use [`Self::empty()`] to create an empty registry, with no providers registered. 
/// @@ -330,6 +333,8 @@ impl Default for ObjectStoreRegistry { providers.insert("cos".into(), Arc::new(tencent::TencentStoreProvider)); #[cfg(feature = "huggingface")] providers.insert("hf".into(), Arc::new(huggingface::HuggingfaceStoreProvider)); + #[cfg(feature = "tos")] + providers.insert("tos".into(), Arc::new(tos::TosStoreProvider)); Self { providers: RwLock::new(providers), active_stores: RwLock::new(HashMap::new()), diff --git a/rust/lance-io/src/object_store/providers/tos.rs b/rust/lance-io/src/object_store/providers/tos.rs new file mode 100644 index 00000000000..923186484c6 --- /dev/null +++ b/rust/lance-io/src/object_store/providers/tos.rs @@ -0,0 +1,301 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::collections::HashMap; +use std::sync::Arc; + +use object_store::ObjectStore as OSObjectStore; +use object_store_opendal::OpendalStore; +use opendal::{Operator, services::Tos}; +use url::Url; + +use crate::object_store::dynamic_opendal::DynamicOpenDalStore; +use crate::object_store::{ + DEFAULT_CLOUD_BLOCK_SIZE, DEFAULT_CLOUD_IO_PARALLELISM, DEFAULT_MAX_IOP_SIZE, ObjectStore, + ObjectStoreParams, ObjectStoreProvider, StorageOptions, +}; +use lance_core::error::{Error, Result}; + +#[derive(Default, Debug)] +pub struct TosStoreProvider; + +impl TosStoreProvider { + fn tos_env_options_from_iter(vars: I) -> HashMap + where + I: IntoIterator, + K: Into, + V: Into, + { + let vars = vars + .into_iter() + .map(|(key, value)| (key.into(), value.into())) + .collect::>(); + let mut config_map = HashMap::new(); + + for prefix in ["VOLCENGINE_", "TOS_"] { + for (key, value) in &vars { + if let Some(stripped_key) = key.strip_prefix(prefix) { + config_map.insert(stripped_key.to_ascii_lowercase(), value.clone()); + } + } + } + + config_map + } + + fn base_tos_options( + base_path: &Url, + storage_options: &StorageOptions, + ) -> Result> { + let bucket = base_path + .host_str() + .ok_or_else(|| 
Error::invalid_input("TOS URL must contain bucket name"))? + .to_string(); + + let prefix = base_path.path().trim_start_matches('/').to_string(); + + let mut config_map = Self::tos_env_options_from_iter(std::env::vars()); + + config_map.extend(storage_options.0.clone()); + + config_map.insert("bucket".to_string(), bucket); + if prefix.is_empty() { + config_map.remove("root"); + } else { + config_map.insert("root".to_string(), "/".to_string()); + } + + Ok(config_map) + } + + /// Normalize TOS storage options, resolving aliases for well-known keys + /// while passing through all other options so that OpenDAL can use them. + fn normalize_tos_config(options: &HashMap) -> Result> { + let mut config_map = options.clone(); + + let alias_groups: &[(&str, &[&str])] = &[ + ("endpoint", &["tos_endpoint"]), + ("region", &["tos_region"]), + ("access_key_id", &["tos_access_key_id"]), + ("secret_access_key", &["tos_secret_access_key"]), + ("security_token", &["tos_security_token"]), + ]; + + for (canonical, aliases) in alias_groups { + for alias in *aliases { + if let Some(value) = config_map.remove(*alias) { + config_map.insert(canonical.to_string(), value); + break; + } + } + } + + if !config_map.contains_key("endpoint") { + return Err(Error::invalid_input( + "TOS endpoint is required. Please provide 'tos_endpoint' in storage options or set TOS_ENDPOINT environment variable", + )); + } + + Ok(config_map) + } + + fn build_tos_store(config_map: HashMap) -> Result { + let operator = Operator::from_iter::(config_map) + .map_err(|e| Error::invalid_input(format!("Failed to create TOS operator: {:?}", e)))? 
+ .finish(); + + Ok(OpendalStore::new(operator)) + } +} + +#[async_trait::async_trait] +impl ObjectStoreProvider for TosStoreProvider { + async fn new_store(&self, base_path: Url, params: &ObjectStoreParams) -> Result { + let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); + let storage_options = StorageOptions(params.storage_options().cloned().unwrap_or_default()); + + let base_options = Self::base_tos_options(&base_path, &storage_options)?; + let accessor = params.get_accessor(); + + let inner: Arc = + if let Some(accessor) = accessor.filter(|a| a.has_provider()) { + Arc::new( + DynamicOpenDalStore::new( + format!("tos:{}", base_path), + base_options, + accessor, + Self::normalize_tos_config, + Self::build_tos_store, + ) + .with_protected_keys(["bucket", "root"]), + ) + } else { + Arc::new(Self::build_tos_store(Self::normalize_tos_config( + &base_options, + )?)?) + }; + + let mut url = base_path; + if !url.path().ends_with('/') { + url.set_path(&format!("{}/", url.path())); + } + + Ok(ObjectStore { + scheme: "tos".to_string(), + inner, + block_size, + max_iop_size: *DEFAULT_MAX_IOP_SIZE, + use_constant_size_upload_parts: params.use_constant_size_upload_parts, + list_is_lexically_ordered: params.list_is_lexically_ordered.unwrap_or(true), + io_parallelism: DEFAULT_CLOUD_IO_PARALLELISM, + download_retry_count: storage_options.download_retry_count(), + io_tracker: Default::default(), + store_prefix: self.calculate_object_store_prefix(&url, params.storage_options())?, + }) + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::sync::Arc; + + use super::TosStoreProvider; + use crate::object_store::dynamic_opendal::DynamicOpenDalStore; + use crate::object_store::test_utils::StaticMockStorageOptionsProvider; + use crate::object_store::{ObjectStoreProvider, StorageOptionsAccessor}; + use url::Url; + + #[test] + fn test_tos_store_path() { + let provider = TosStoreProvider; + + let url = 
Url::parse("tos://bucket/path/to/file").unwrap(); + let path = provider.extract_path(&url).unwrap(); + let expected_path = object_store::path::Path::from("path/to/file"); + assert_eq!(path, expected_path); + } + + #[test] + fn test_tos_env_options_normalize_supported_prefixes() { + let config = TosStoreProvider::tos_env_options_from_iter([ + ("VOLCENGINE_ENDPOINT", "https://tos-cn-beijing.volces.com"), + ("TOS_ACCESS_KEY_ID", "tos-akid"), + ("TOS_SECRET_ACCESS_KEY", "tos-secret"), + ]); + + assert_eq!( + config.get("endpoint").unwrap(), + "https://tos-cn-beijing.volces.com" + ); + assert_eq!(config.get("access_key_id").unwrap(), "tos-akid"); + assert_eq!(config.get("secret_access_key").unwrap(), "tos-secret"); + } + + #[test] + fn test_tos_alias_options_override_canonical_env_options() { + let config = TosStoreProvider::normalize_tos_config(&HashMap::from([ + ( + "endpoint".to_string(), + "https://env.example.com".to_string(), + ), + ( + "tos_endpoint".to_string(), + "https://user.example.com".to_string(), + ), + ("region".to_string(), "env-region".to_string()), + ("tos_region".to_string(), "user-region".to_string()), + ("access_key_id".to_string(), "env-akid".to_string()), + ("tos_access_key_id".to_string(), "user-akid".to_string()), + ("secret_access_key".to_string(), "env-secret".to_string()), + ( + "tos_secret_access_key".to_string(), + "user-secret".to_string(), + ), + ("security_token".to_string(), "env-token".to_string()), + ("tos_security_token".to_string(), "user-token".to_string()), + ("bucket".to_string(), "bucket".to_string()), + ])) + .unwrap(); + + assert_eq!(config.get("endpoint").unwrap(), "https://user.example.com"); + assert_eq!(config.get("region").unwrap(), "user-region"); + assert_eq!(config.get("access_key_id").unwrap(), "user-akid"); + assert_eq!(config.get("secret_access_key").unwrap(), "user-secret"); + assert_eq!(config.get("security_token").unwrap(), "user-token"); + assert!(!config.contains_key("tos_endpoint")); + 
assert!(!config.contains_key("tos_secret_access_key")); + assert!(!config.contains_key("tos_security_token")); + } + + #[test] + fn test_tos_url_bucket_and_root_are_authoritative() { + let storage_options = crate::object_store::StorageOptions(HashMap::from([ + ( + "tos_endpoint".to_string(), + "https://tos-cn-beijing.volces.com".to_string(), + ), + ("bucket".to_string(), "storage-options-bucket".to_string()), + ("root".to_string(), "/storage-options-root".to_string()), + ])); + let base_options = TosStoreProvider::base_tos_options( + &Url::parse("tos://url-bucket/path").unwrap(), + &storage_options, + ) + .unwrap(); + let config = TosStoreProvider::normalize_tos_config(&base_options).unwrap(); + + assert_eq!(config.get("bucket").unwrap(), "url-bucket"); + assert_eq!(config.get("root").unwrap(), "/"); + + let base_options = TosStoreProvider::base_tos_options( + &Url::parse("tos://url-bucket").unwrap(), + &storage_options, + ) + .unwrap(); + let config = TosStoreProvider::normalize_tos_config(&base_options).unwrap(); + + assert_eq!(config.get("bucket").unwrap(), "url-bucket"); + assert!(!config.contains_key("root")); + } + + #[tokio::test] + async fn test_dynamic_opendal_tos_store_uses_provider_credentials() { + let accessor = Arc::new(StorageOptionsAccessor::with_provider(Arc::new( + StaticMockStorageOptionsProvider { + options: HashMap::from([ + ( + "tos_endpoint".to_string(), + "https://tos-cn-beijing.volces.com".to_string(), + ), + ("tos_region".to_string(), "cn-beijing".to_string()), + ("tos_access_key_id".to_string(), "akid".to_string()), + ("tos_secret_access_key".to_string(), "secret".to_string()), + ("tos_security_token".to_string(), "token".to_string()), + ]), + }, + ))); + + let base_options = TosStoreProvider::base_tos_options( + &Url::parse("tos://url-bucket/path").unwrap(), + &crate::object_store::StorageOptions(HashMap::new()), + ) + .unwrap(); + + let store = DynamicOpenDalStore::new( + "tos", + base_options, + accessor, + 
TosStoreProvider::normalize_tos_config, + TosStoreProvider::build_tos_store, + ) + .with_protected_keys(["bucket", "root"]); + + let current_store = store + .current_store() + .await + .expect("dynamic OpenDAL TOS store should build"); + + assert!(current_store.to_string().contains("Opendal")); + } +} diff --git a/rust/lance-io/tests/tos_integration.rs b/rust/lance-io/tests/tos_integration.rs new file mode 100644 index 00000000000..97eca30090e --- /dev/null +++ b/rust/lance-io/tests/tos_integration.rs @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors +//! These integration tests can only be run against a real Volcengine TOS bucket. + +#![cfg(feature = "tos-test")] + +use futures::TryStreamExt; +use lance_io::object_store::ObjectStore; +use object_store::ObjectStoreExt; +use object_store::path::Path; +use tokio::io::AsyncWriteExt; + +fn tos_bucket() -> String { + std::env::var("TOS_BUCKET").expect("TOS_BUCKET must be set") +} + +async fn delete_prefix(store: &ObjectStore, prefix: &str) { + let prefix_path = Path::from(prefix); + let locations = store + .inner + .list(Some(&prefix_path)) + .map_ok(|meta| meta.location) + .try_collect::>() + .await + .unwrap_or_default(); + + for location in locations { + let _ = store.inner.delete(&location).await; + } +} + +#[ignore = "Must be run manually on Volcengine TOS"] +#[tokio::test] +async fn test_tos_write_read_list_delete() { + let prefix = format!("lance-tos-{}-{}", std::process::id(), rand::random::()); + let bucket = tos_bucket(); + let (store, base_path) = ObjectStore::from_uri(&format!("tos://{bucket}/{prefix}")) + .await + .unwrap(); + assert_eq!(base_path, Path::from(prefix.as_str())); + + let path = Path::from(format!("{prefix}/small.txt")); + delete_prefix(&store, &prefix).await; + + let result: Result<(), Box> = async { + let mut writer = store.create(&path).await?; + writer.write_all(b"hello").await?; + writer.write_all(b" tos").await?; + 
writer.shutdown().await?; + + let meta = store.inner.head(&path).await?; + if meta.size != 9 { + return Err(format!("expected object size 9, got {}", meta.size).into()); + } + + let data = store.inner.get(&path).await?.bytes().await?; + if data.as_ref() != b"hello tos" { + return Err("downloaded TOS object content did not match".into()); + } + + let listed = store + .inner + .list(Some(&Path::from(prefix.as_str()))) + .try_collect::>() + .await?; + if !listed.iter().any(|meta| meta.location == path) { + return Err("uploaded TOS object was not returned by list".into()); + } + + store.inner.delete(&path).await?; + if store.exists(&path).await? { + return Err("deleted TOS object still exists".into()); + } + + Ok(()) + } + .await; + + delete_prefix(&store, &prefix).await; + result.unwrap(); +} diff --git a/rust/lance/Cargo.toml b/rust/lance/Cargo.toml index e83353e9a63..95cc11e94a3 100644 --- a/rust/lance/Cargo.toml +++ b/rust/lance/Cargo.toml @@ -138,7 +138,7 @@ parquet = { version = "58", default-features = false, features = ["arrow", "asyn reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "json"] } [features] -default = ["aws", "azure", "gcp", "oss", "huggingface", "tencent", "geo"] +default = ["aws", "azure", "gcp", "oss", "huggingface", "tencent", "tos", "geo"] fp16kernels = ["lance-linalg/fp16kernels"] # Prevent dynamic linking of lzma, which comes from datafusion cli = ["dep:clap", "lzma-sys/static"] @@ -158,6 +158,7 @@ azure = ["lance-io/azure"] oss = ["lance-io/oss"] tencent = ["lance-io/tencent"] huggingface = ["lance-io/huggingface"] +tos = ["lance-io/tos"] geo = ["lance-datafusion/geo", "lance-index/geo"] # Enable slow integration tests (disabled by default in CI) slow_tests = [] From 679ef3d4ecd0000896cd3a571b3301478b68a552 Mon Sep 17 00:00:00 2001 From: Xin Sun Date: Thu, 4 Jun 2026 17:04:27 +0800 Subject: [PATCH 015/177] fix: advance kmeans redo random init rng (#7074) KMeans redos created one RNG before the redo loop, but 
each random initialization cloned that same initial RNG state. As a result, redos greater than one could repeatedly start from the same randomly selected centroids instead of exploring distinct initializations. This reuses the RNG mutably across redo attempts so each random initialization consumes the advanced RNG state. --- rust/lance-index/src/vector/kmeans.rs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/rust/lance-index/src/vector/kmeans.rs b/rust/lance-index/src/vector/kmeans.rs index d0e86028b89..b11fb70bed0 100644 --- a/rust/lance-index/src/vector/kmeans.rs +++ b/rust/lance-index/src/vector/kmeans.rs @@ -800,14 +800,14 @@ impl KMeans { let mut adjusted_balance_factor = f32::MAX; // TODO: use seed for Rng. - let rng = SmallRng::from_os_rng(); + let mut rng = SmallRng::from_os_rng(); for redo in 1..=params.redos { let mut kmeans: Self = match &params.init { KMeanInit::Random => Self::init_random::( data.values(), dimension, k, - rng.clone(), + &mut rng, params.distance_type, ), KMeanInit::Incremental(centroids) => Self::with_centroids( @@ -1592,6 +1592,21 @@ mod tests { assert_eq!(expected, actual); } + #[test] + fn test_random_init_advances_rng() { + let values = Float32Array::from_iter_values((0..64).map(|value| value as f32)); + let mut rng = SmallRng::seed_from_u64(42); + let first = + KMeans::init_random::(values.values(), 1, 8, &mut rng, DistanceType::L2); + let second = + KMeans::init_random::(values.values(), 1, 8, &mut rng, DistanceType::L2); + + assert_ne!( + first.centroids.as_primitive::().values(), + second.centroids.as_primitive::().values(), + ); + } + #[tokio::test] async fn test_compute_membership_and_loss() { const DIM: usize = 256; From 0d3f1250b8bd381c97e0548ad1fadcc7ef0b8af6 Mon Sep 17 00:00:00 2001 From: Xin Sun Date: Thu, 4 Jun 2026 17:05:07 +0800 Subject: [PATCH 016/177] fix(python): clamp target partition sizing (#7036) Fix Python target partition size inference to clamp the derived IVF partition
count to `1..=4096`, matching the Rust path. The previous helper used `4096` as a lower bound, which produced oversized partition counts for small datasets and missed the upper bound for large datasets. --- python/python/lance/util.py | 2 +- python/python/tests/test_vector_index.py | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/python/python/lance/util.py b/python/python/lance/util.py index 5b94ad5c35a..2161c4e0d45 100644 --- a/python/python/lance/util.py +++ b/python/python/lance/util.py @@ -254,4 +254,4 @@ def _target_partition_size_to_num_partitions( if target_partition_size is None: target_partition_size = 8192 num_partitions = num_rows // target_partition_size - return max(1, num_partitions, 4096) + return min(max(1, num_partitions), 4096) diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index d8f13a01da6..047b94bf8ac 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -23,7 +23,10 @@ from lance.dataset import VectorIndexReader from lance.indices import IndexFileVersion, IndicesBuilder from lance.query import MatchQuery, PhraseQuery -from lance.util import validate_vector_index # noqa: E402 +from lance.util import ( # noqa: E402 + _target_partition_size_to_num_partitions, + validate_vector_index, +) from lance.vector import vec_to_table # noqa: E402 @@ -856,6 +859,12 @@ def test_create_ivf_pq_with_target_partition_size(dataset, tmp_path): assert ann_ds.stats.index_stats("vector_idx")["indices"][0]["num_partitions"] == 2 +def test_target_partition_size_to_num_partitions_clamps(): + assert _target_partition_size_to_num_partitions(1000, 1000) == 1 + assert _target_partition_size_to_num_partitions(1000, 500) == 2 + assert _target_partition_size_to_num_partitions(8192 * 5000, 8192) == 4096 + + def test_index_size_stats(tmp_path: Path): num_rows = 512 dims = 32 From 49c0a625e51230137e538545eb82c277a2a938f1 Mon Sep 17 00:00:00 2001 
From: ForwardXu Date: Thu, 4 Jun 2026 17:07:33 +0800 Subject: [PATCH 017/177] fix(lance-index): fix some flaky tests (#7052) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Fix three flaky tests ### 1. Fix flaky `test_simple_index_nearest_centroid::case_1_f16` (rust) The `test_simple_index_nearest_centroid::case_1_f16` was flaky because `HNSW` approximate search with ef=15 could not reliably find the nearest centroid when querying with 42.1f32 against f16-precision centroids. The `f16` cast to `f32` introduces subtle precision differences that alter the `HNSW` graph structure, causing the search to follow incorrect paths and return ID 45 instead of 42. Fix by using an exact match query value (42.0f32) for the f16 case, ensuring zero distance to the target centroid so `HNSW` always finds it. The f32 case retains the original 42.1f32 query. Fixes flaky test introduced in a57ec81. ### 2. Fix flaky `test_create_inverted_index_progress_callback_error_after_completion_is_ignored` (python) The test was failing because the `complete:write_metadata` progress event was being dispatched (and its callback error propagated) during the pump loop **before** the future completed — the future still had commit work to do after the builder emitted `stage_complete("write_metadata")`. The `block_on_pumping` function only ignores callback errors in the final pump **after** the future resolves. But since the event arrives in the channel before the commit step finishes, it gets drained in the loop where errors propagate. Fix by making `IndexProgressDispatcher::drain()` tolerate callback errors on `Complete`-type events. Complete events are purely informational — the stage's actual work is already done, so a callback failure should never abort the operation. `Start` and `Progress` events still propagate errors normally, preserving the "error before completion propagates" semantics. ### 3. 
Fix flaky `test_list_acquires_token_before_starting_underlying_stream` (rust) The test was flaky on Windows CI because it relies on real-time assertions with a 5ms timeout, but Windows system timer resolution (~15.6ms) makes such tight timing unreliable. The root cause is that `TokenBucketState` used `std::time::Instant` which is not controllable in tests. When the token bucket has no available tokens, the test asserts that `stream.next()` should block (timeout after 5ms), but on Windows the elapsed time measurement is too coarse. Fix by: - Switching `TokenBucketState.last_refill` from `std::time::Instant` to `tokio::time::Instant` - Adding `#[tokio::test(start_paused = true)]` to the two timing-sensitive list throttle tests - Adding `tokio = { workspace = true, features = ["test-util"] }` to dev-dependencies With `start_paused = true`, tokio fully controls time advancement, making the tests deterministic regardless of OS timer resolution. --- python/src/dataset.rs | 17 ++++++++++++-- rust/lance-index/src/vector/utils.rs | 8 +++---- rust/lance-io/Cargo.toml | 1 + rust/lance-io/src/object_store/throttle.rs | 27 ++++++++++++++-------- 4 files changed, 37 insertions(+), 16 deletions(-) diff --git a/python/src/dataset.rs b/python/src/dataset.rs index 1c7db74ddb4..8c16ca85399 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -3609,7 +3609,7 @@ impl PyWriteDest { } } -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, PartialEq)] enum IndexProgressEventType { Start, Progress, @@ -3765,7 +3765,20 @@ impl IndexProgressDispatcher { fn drain(&mut self) -> PyResult<()> { while let Ok(event) = self.receiver.try_recv() { - self.dispatch(event)?; + let is_complete = event.event == IndexProgressEventType::Complete; + if let Err(err) = self.dispatch(event) { + if is_complete { + // Complete events are purely informational — the stage's work + // is already done. 
Propagating a callback error here would + // abort the operation after the real work has succeeded. + log::warn!( + "Ignoring progress callback error on stage-complete event: {}", + err + ); + } else { + return Err(err); + } + } } Ok(()) } diff --git a/rust/lance-index/src/vector/utils.rs b/rust/lance-index/src/vector/utils.rs index 1e56370613e..8307bd9edff 100644 --- a/rust/lance-index/src/vector/utils.rs +++ b/rust/lance-index/src/vector/utils.rs @@ -302,13 +302,13 @@ mod tests { #[rstest] #[case::f16(Arc::new(Float16Array::from( (0..100).flat_map(|i| std::iter::repeat_n(f16::from_f32(i as f32), 16)).collect::>(), - )) as ArrayRef)] + )) as ArrayRef, 42.0f32)] #[case::f32(Arc::new(Float32Array::from( (0..100).flat_map(|i| std::iter::repeat_n(i as f32, 16)).collect::>(), - )) as ArrayRef)] - fn test_simple_index_nearest_centroid(#[case] centroids: ArrayRef) { + )) as ArrayRef, 42.1f32)] + fn test_simple_index_nearest_centroid(#[case] centroids: ArrayRef, #[case] query_val: f32) { let index = build_index(centroids, 16); - let query: ArrayRef = Arc::new(Float32Array::from(vec![42.1f32; 16])); + let query: ArrayRef = Arc::new(Float32Array::from(vec![query_val; 16])); let (id, _) = index.search(query).unwrap(); assert_eq!(id, 42); } diff --git a/rust/lance-io/Cargo.toml b/rust/lance-io/Cargo.toml index c3d01941968..d1aabff3f7e 100644 --- a/rust/lance-io/Cargo.toml +++ b/rust/lance-io/Cargo.toml @@ -59,6 +59,7 @@ test-log.workspace = true mockall.workspace = true rstest.workspace = true mock_instant.workspace = true +tokio = { workspace = true, features = ["test-util"] } tracing-mock = { workspace = true } [[bench]] diff --git a/rust/lance-io/src/object_store/throttle.rs b/rust/lance-io/src/object_store/throttle.rs index dc66f69cf90..bac1d3a538e 100644 --- a/rust/lance-io/src/object_store/throttle.rs +++ b/rust/lance-io/src/object_store/throttle.rs @@ -318,7 +318,7 @@ impl AimdThrottleConfig { struct TokenBucketState { tokens: f64, - last_refill: std::time::Instant, + 
last_refill: tokio::time::Instant, rate: f64, } @@ -346,7 +346,7 @@ impl OperationThrottle { controller, bucket: Mutex::new(TokenBucketState { tokens: burst_capacity, - last_refill: std::time::Instant::now(), + last_refill: tokio::time::Instant::now(), rate: initial_rate, }), burst_capacity, @@ -364,7 +364,7 @@ impl OperationThrottle { async fn acquire_token(&self) { let sleep_duration = { let mut bucket = self.bucket.lock().await; - let now = std::time::Instant::now(); + let now = tokio::time::Instant::now(); let elapsed = now.duration_since(bucket.last_refill).as_secs_f64(); bucket.tokens = (bucket.tokens + elapsed * bucket.rate).min(self.burst_capacity); bucket.last_refill = now; @@ -1176,12 +1176,15 @@ mod tests { } fn list_start_throttle_config() -> AimdThrottleConfig { + // Use a low rate (10 tokens/s) so that the token-acquisition sleep is + // 1/10 = 100 ms — well above the 50 ms timeout used in assertions, + // avoiding flakiness from coarse OS timer resolution (e.g. Windows ~16 ms). AimdThrottleConfig::default() .with_burst_capacity(0) - .with_list_aimd(AimdConfig::default().with_initial_rate(50.0)) + .with_list_aimd(AimdConfig::default().with_initial_rate(10.0)) } - #[tokio::test] + #[tokio::test(start_paused = true)] async fn test_list_acquires_token_before_starting_underlying_stream() { let store = Arc::new(CountingListStartStore::default()); store @@ -1199,14 +1202,16 @@ mod tests { let mut stream = throttled.list(Some(&Path::from("prefix"))); assert_eq!(store.list_calls(), 0); + // With rate=10 tokens/s and burst_capacity=0, the token acquisition + // sleeps for 100 ms. A 50 ms timeout must expire before that. 
assert!( - tokio::time::timeout(std::time::Duration::from_millis(5), stream.next()) + tokio::time::timeout(std::time::Duration::from_millis(50), stream.next()) .await .is_err() ); assert_eq!(store.list_calls(), 0); - let item = tokio::time::timeout(std::time::Duration::from_millis(100), stream.next()) + let item = tokio::time::timeout(std::time::Duration::from_millis(300), stream.next()) .await .unwrap() .unwrap() @@ -1215,7 +1220,7 @@ mod tests { assert_eq!(store.list_calls(), 1); } - #[tokio::test] + #[tokio::test(start_paused = true)] async fn test_list_with_offset_acquires_token_before_starting_underlying_stream() { let store = Arc::new(CountingListStartStore::default()); store @@ -1231,14 +1236,16 @@ mod tests { let mut stream = throttled.list_with_offset(Some(&Path::from("prefix")), &Path::from("prefix/a")); assert_eq!(store.offset_calls(), 0); + // With rate=10 tokens/s and burst_capacity=0, the token acquisition + // sleeps for 100 ms. A 50 ms timeout must expire before that. assert!( - tokio::time::timeout(std::time::Duration::from_millis(5), stream.next()) + tokio::time::timeout(std::time::Duration::from_millis(50), stream.next()) .await .is_err() ); assert_eq!(store.offset_calls(), 0); - let item = tokio::time::timeout(std::time::Duration::from_millis(100), stream.next()) + let item = tokio::time::timeout(std::time::Duration::from_millis(300), stream.next()) .await .unwrap() .unwrap() From 63c37e7fa5e7b48c922ece71be61eec8effb874e Mon Sep 17 00:00:00 2001 From: Prashanth Rao <35005448+prrao87@users.noreply.github.com> Date: Thu, 4 Jun 2026 12:31:58 -0400 Subject: [PATCH 018/177] docs: update ecosystem role for maintainer (#7102) Updating Zhang Yue's ecosystem roles. 
--- docs/src/community/maintainers.md | 64 +++++++++++++++---------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/docs/src/community/maintainers.md b/docs/src/community/maintainers.md index 755201f069e..f3ba6e70304 100644 --- a/docs/src/community/maintainers.md +++ b/docs/src/community/maintainers.md @@ -40,38 +40,38 @@ Maintainers with GitHub write access are additionally encouraged to: ## Roster -| Name | GitHub Handle | Affiliation | GitHub Write Access | Ecosystem Roles | -|------------------------|----------------------|-------------------|---------------------|-------------------------------------------------| -| Wyatt Alt | wkalt | LanceDB | ✓ | | -| Matt Basta | mattbasta | Runway AI | | | -| Giuseppe Battista | giusedroid | AWS | | | -| Timothy Carambat | timothycarambat | Anything LLM | | | -| Ayush Chaurasia | AyushExel | LanceDB | | -| Chongchen Chen | chenkovsky | MiraclePlus | | | -| Akela Drissner-Schmid | akelad | dltHub | | | -| Ty Dunn | TyDunn | Continue | | | -| Enwei Jiao | jiaoew1991 | Luma.ai | ✓ | Milvus Maintainer | -| Bryan Keller | bryanck | Netflix | | Apache Iceberg Committer | -| Aman Kishore | AmanKishore | Harvey.ai | | | -| Sangwu Lee | RE-N-Y | Krea.ai | | | -| Jeremy Leibs | jleibs | Rerun.io | | | -| Haocheng Liu | HaochengLIU | Seven Research | ✓ | | -| Nathan Ma | majin1102 | ByteDance | ✓ | Apache Amoro (incubating) PPMC Member | -| ChanChan Mao | ccmao1130 | LanceDB | | | -| Lu Qiu | LuQQiu | LanceDB | ✓ | Alluxio PMC Member | -| Dan Rammer | hamersaw | LanceDB | ✓ | | -| Rong Rong | walterddr | Google DeepMind | | Apache Pinot PMC Member, Apache Flink Committer | -| Nat Roth | nrothGIT | Meta AI | | | -| Kevin Shaffer-Morrison | kevinshaffermorrison | AWS | | | -| Noah Shpak | noahshpak | Thinking Machines | | | -| Chunxu Tang | ChunxuTang | Google | | PrestoDB Committer | -| Ankit Vij | ankitvij-db | Databricks | | | -| Beinan Wang | beinan | Microsoft AI | | Alluxio PMC Member, Presto TSC Member | 
-| Jiacheng Yang | jiachengdb | Google AI | | | -| Yang Jie | LuciferYang | Baidu Inc. | | Apache Spark PMC Member, Apache Uniffle PMC Member | -| Jianjian Xie | jja725 | Uber | | | -| Zhang Yue | zhangyue19921010 | ByteDance | | | -| Jinglun | wojiaodoubao | ByteDance | | Apache Hadoop Committer | +| Name | GitHub Handle | Affiliation | GitHub Write Access | Ecosystem Roles | +|------------------------|----------------------|-------------------|---------------------|----------------------------------------------------| +| Wyatt Alt | wkalt | LanceDB | ✓ | | +| Matt Basta | mattbasta | Runway AI | | | +| Giuseppe Battista | giusedroid | AWS | | | +| Timothy Carambat | timothycarambat | Anything LLM | | | +| Ayush Chaurasia | AyushExel | LanceDB | | | +| Chongchen Chen | chenkovsky | MiraclePlus | | | +| Akela Drissner-Schmid | akelad | dltHub | | | +| Ty Dunn | TyDunn | Continue | | | +| Enwei Jiao | jiaoew1991 | Luma.ai | ✓ | Milvus Maintainer | +| Bryan Keller | bryanck | Netflix | | Apache Iceberg Committer | +| Aman Kishore | AmanKishore | Harvey.ai | | | +| Sangwu Lee | RE-N-Y | Krea.ai | | | +| Jeremy Leibs | jleibs | Rerun.io | | | +| Haocheng Liu | HaochengLIU | Seven Research | ✓ | | +| Nathan Ma | majin1102 | ByteDance | ✓ | Apache Amoro (incubating) PPMC Member | +| ChanChan Mao | ccmao1130 | LanceDB | | | +| Lu Qiu | LuQQiu | LanceDB | ✓ | Alluxio PMC Member | +| Dan Rammer | hamersaw | LanceDB | ✓ | | +| Rong Rong | walterddr | Google DeepMind | | Apache Pinot PMC Member, Apache Flink Committer | +| Nat Roth | nrothGIT | Meta AI | | | +| Kevin Shaffer-Morrison | kevinshaffermorrison | AWS | | | +| Noah Shpak | noahshpak | Thinking Machines | | | +| Chunxu Tang | ChunxuTang | Google | | PrestoDB Committer | +| Ankit Vij | ankitvij-db | Databricks | | | +| Beinan Wang | beinan | Microsoft AI | | Alluxio PMC Member, Presto TSC Member | +| Jiacheng Yang | jiachengdb | Google AI | | | +| Yang Jie | LuciferYang | Baidu Inc. 
| | Apache Spark PMC Member, Apache Uniffle PMC Member | +| Jianjian Xie | jja725 | Uber | | | +| Zhang Yue | zhangyue19921010 | ByteDance | | Apache Hudi PMC, Apache Druid Committer, Kafka Contributor | +| Jinglun | wojiaodoubao | ByteDance | | Apache Hadoop Committer | ## Becoming a Maintainer From f55d2087f83447d06dc5ee10c1d6cad1ec5cf0af Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Fri, 5 Jun 2026 00:32:36 +0800 Subject: [PATCH 019/177] feat!: return write summaries from file writers (#7096) FileWriter and IndexWriter now return write summaries from finish so callers can access the final object size without issuing an extra size lookup. This lets dataset fragment metadata and index writer callers propagate file sizes directly from the completed write path while keeping Python's existing LanceFileWriter.finish row-count behavior. --- python/src/file.rs | 6 ++- rust/lance-file/src/writer.rs | 30 ++++++++++--- rust/lance-index/src/scalar.rs | 14 +++++- .../src/scalar/inverted/builder.rs | 13 +++--- rust/lance-index/src/scalar/lance_format.rs | 44 ++++++++++++++----- rust/lance-index/src/scalar/ngram.rs | 6 ++- rust/lance-index/src/scalar/rtree.rs | 3 +- rust/lance/src/dataset/fragment/write.rs | 5 ++- rust/lance/src/dataset/write.rs | 9 ++-- 9 files changed, 94 insertions(+), 36 deletions(-) diff --git a/python/src/file.rs b/python/src/file.rs index ab5bda77fb0..b0bc20f9d0a 100644 --- a/python/src/file.rs +++ b/python/src/file.rs @@ -347,8 +347,10 @@ impl LanceFileWriter { } pub fn finish(&self) -> PyResult { - rt().block_on(None, async { self.inner.lock().await.finish().await })? - .infer_error() + rt().block_on(None, async { + self.inner.lock().await.finish().await.map(|s| s.num_rows) + })? 
+ .infer_error() } pub fn add_global_buffer(&self, bytes: Vec) -> PyResult { diff --git a/rust/lance-file/src/writer.rs b/rust/lance-file/src/writer.rs index 902c462c820..14a4c82bde6 100644 --- a/rust/lance-file/src/writer.rs +++ b/rust/lance-file/src/writer.rs @@ -49,6 +49,15 @@ const PAD_BUFFER: [u8; PAGE_BUFFER_ALIGNMENT] = [72; PAGE_BUFFER_ALIGNMENT]; const MAX_PAGE_BYTES: usize = 32 * 1024 * 1024; const ENV_LANCE_FILE_WRITER_MAX_PAGE_BYTES: &str = "LANCE_FILE_WRITER_MAX_PAGE_BYTES"; +/// Summary of a completed Lance file write. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct FileWriteSummary { + /// The number of rows written to the file. + pub num_rows: u64, + /// The final size of the file in bytes. + pub size_bytes: u64, +} + #[derive(Debug, Clone, Default)] pub struct FileWriterOptions { /// How many bytes to use for buffering column data @@ -303,7 +312,7 @@ impl FileWriter { for batch in batches { writer.write_batch(&batch).await?; } - Ok(writer.finish().await? as usize) + Ok(writer.finish().await?.num_rows as usize) } async fn do_write_buffer(writer: &mut (impl AsyncWrite + Unpin), buf: &[u8]) -> Result<()> { @@ -755,8 +764,8 @@ impl FileWriter { /// will write the file metadata and the footer. It will not return until all /// data has been flushed and the file has been closed. /// - /// Returns the total number of rows written - pub async fn finish(&mut self) -> Result { + /// Returns a summary of the completed file write. + pub async fn finish(&mut self) -> Result { // 1. flush any remaining data and write out those pages let mut external_buffers = OutOfLineBuffers::new(self.tell().await?, PAGE_BUFFER_ALIGNMENT as u64); @@ -812,9 +821,12 @@ impl FileWriter { self.writer.write_all(MAGIC).await?; // 7. 
close the writer - Writer::shutdown(self.writer.as_mut()).await?; + let write_result = Writer::shutdown(self.writer.as_mut()).await?; - Ok(self.rows_written) + Ok(FileWriteSummary { + num_rows: self.rows_written, + size_bytes: write_result.size as u64, + }) } pub async fn abort(&mut self) { @@ -1581,8 +1593,12 @@ mod tests { .unwrap(); writer.write_batch(&batch).await.unwrap(); - let num_rows = writer.finish().await.unwrap(); - assert_eq!(num_rows, 2); + let write_summary = writer.finish().await.unwrap(); + assert_eq!(write_summary.num_rows, 2); + assert_eq!( + write_summary.size_bytes, + fs.object_store.size(&path).await.unwrap() + ); // Read back with split configuration let file_scheduler = fs diff --git a/rust/lance-index/src/scalar.rs b/rust/lance-index/src/scalar.rs index 5ab138ff481..ac2efc26870 100644 --- a/rust/lance-index/src/scalar.rs +++ b/rust/lance-index/src/scalar.rs @@ -52,6 +52,13 @@ use lance_datafusion::udf::CONTAINS_TOKENS_UDF; pub const LANCE_SCALAR_INDEX: &str = "__lance_scalar_index"; +/// Summary of a completed index file write. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct IndexWriteSummary { + /// The final size of the index file in bytes. 
+ pub size_bytes: u64, +} + /// Builtin index types supported by the Lance library /// /// This is primarily for convenience to avoid a bunch of string @@ -182,9 +189,12 @@ pub trait IndexWriter: Send { )) } /// Finishes writing the file and closes the file - async fn finish(&mut self) -> Result<()>; + async fn finish(&mut self) -> Result; /// Finishes writing the file and closes the file with additional metadata - async fn finish_with_metadata(&mut self, metadata: HashMap) -> Result<()>; + async fn finish_with_metadata( + &mut self, + metadata: HashMap, + ) -> Result; } /// Trait for reading an index (or parts of an index) from storage diff --git a/rust/lance-index/src/scalar/inverted/builder.rs b/rust/lance-index/src/scalar/inverted/builder.rs index b8b6b52c6af..7c347b50af3 100644 --- a/rust/lance-index/src/scalar/inverted/builder.rs +++ b/rust/lance-index/src/scalar/inverted/builder.rs @@ -2055,7 +2055,7 @@ mod tests { use super::*; use crate::metrics::NoOpMetricsCollector; use crate::progress::IndexBuildProgress; - use crate::scalar::{IndexFile, IndexReader, IndexWriter, ScalarIndex}; + use crate::scalar::{IndexFile, IndexReader, IndexWriteSummary, IndexWriter, ScalarIndex}; use arrow_array::{RecordBatch, StringArray, UInt64Array}; use arrow_schema::{DataType, Field, Schema}; use async_trait::async_trait; @@ -2242,12 +2242,15 @@ mod tests { Ok(1) } - async fn finish(&mut self) -> Result<()> { - Ok(()) + async fn finish(&mut self) -> Result { + Ok(IndexWriteSummary { size_bytes: 0 }) } - async fn finish_with_metadata(&mut self, _metadata: HashMap) -> Result<()> { - Ok(()) + async fn finish_with_metadata( + &mut self, + _metadata: HashMap, + ) -> Result { + Ok(IndexWriteSummary { size_bytes: 0 }) } } diff --git a/rust/lance-index/src/scalar/lance_format.rs b/rust/lance-index/src/scalar/lance_format.rs index 9fcd888876d..3be82def542 100644 --- a/rust/lance-index/src/scalar/lance_format.rs +++ b/rust/lance-index/src/scalar/lance_format.rs @@ -3,7 +3,7 @@ //! 
Utilities for serializing and deserializing scalar indices in the lance format -use super::{IndexReader, IndexStore, IndexWriter}; +use super::{IndexReader, IndexStore, IndexWriteSummary, IndexWriter}; use arrow_array::RecordBatch; use arrow_schema::Schema; use async_trait::async_trait; @@ -109,14 +109,21 @@ impl IndexWriter for PreviousFileWrit Ok(offset as u64) } - async fn finish(&mut self) -> Result<()> { - Self::finish(self).await.map(|_| ()) + async fn finish(&mut self) -> Result { + Self::finish(self).await?; + Ok(IndexWriteSummary { + size_bytes: self.tell().await? as u64, + }) } - async fn finish_with_metadata(&mut self, metadata: HashMap) -> Result<()> { - Self::finish_with_metadata(self, &metadata) - .await - .map(|_| ()) + async fn finish_with_metadata( + &mut self, + metadata: HashMap, + ) -> Result { + Self::finish_with_metadata(self, &metadata).await?; + Ok(IndexWriteSummary { + size_bytes: self.tell().await? as u64, + }) } } @@ -132,15 +139,24 @@ impl IndexWriter for current_writer::FileWriter { Self::add_global_buffer(self, data).await } - async fn finish(&mut self) -> Result<()> { - Self::finish(self).await.map(|_| ()) + async fn finish(&mut self) -> Result { + let summary = Self::finish(self).await?; + Ok(IndexWriteSummary { + size_bytes: summary.size_bytes, + }) } - async fn finish_with_metadata(&mut self, metadata: HashMap) -> Result<()> { + async fn finish_with_metadata( + &mut self, + metadata: HashMap, + ) -> Result { metadata.into_iter().for_each(|(k, v)| { self.add_schema_metadata(k, v); }); - Self::finish(self).await.map(|_| ()) + let summary = Self::finish(self).await?; + Ok(IndexWriteSummary { + size_bytes: summary.size_bytes, + }) } } @@ -479,7 +495,11 @@ mod tests { .unwrap(); let expected = bytes::Bytes::from_static(b"scalar-global-buffer"); let buffer_idx = writer.add_global_buffer(expected.clone()).await.unwrap(); - writer.finish().await.unwrap(); + let write_summary = writer.finish().await.unwrap(); + let files = 
index_store.list_files_with_sizes().await.unwrap(); + assert_eq!(files.len(), 1); + assert_eq!(files[0].path, "global-buffer.lance"); + assert_eq!(write_summary.size_bytes, files[0].size_bytes); let reader = index_store .open_index_file("global-buffer.lance") diff --git a/rust/lance-index/src/scalar/ngram.rs b/rust/lance-index/src/scalar/ngram.rs index 5f7ce57da4a..748a8f5231c 100644 --- a/rust/lance-index/src/scalar/ngram.rs +++ b/rust/lance-index/src/scalar/ngram.rs @@ -1079,7 +1079,8 @@ impl NGramIndexBuilder { } } - writer.finish().await + writer.finish().await?; + Ok(()) } async fn merge_spill_files( @@ -1221,7 +1222,8 @@ impl NGramIndexBuilder { offset += batch_size; } - writer.finish().await + writer.finish().await?; + Ok(()) } } diff --git a/rust/lance-index/src/scalar/rtree.rs b/rust/lance-index/src/scalar/rtree.rs index 246c41ba006..e419c59baa6 100644 --- a/rust/lance-index/src/scalar/rtree.rs +++ b/rust/lance-index/src/scalar/rtree.rs @@ -885,7 +885,8 @@ impl RTreeIndexPlugin { )?; writer.write_record_batch(batch).await?; - writer.finish().await + writer.finish().await?; + Ok(()) } async fn train_rtree_index( diff --git a/rust/lance/src/dataset/fragment/write.rs b/rust/lance/src/dataset/fragment/write.rs index b10158224f9..9731be0c0eb 100644 --- a/rust/lance/src/dataset/fragment/write.rs +++ b/rust/lance/src/dataset/fragment/write.rs @@ -12,6 +12,7 @@ use lance_file::previous::writer::FileWriter as PreviousFileWriter; use lance_file::version::LanceFileVersion; use lance_file::writer::FileWriterOptions; use lance_io::object_store::ObjectStore; +use lance_io::utils::CachedFileSize; use lance_table::format::{DataFile, Fragment}; use lance_table::io::manifest::ManifestDescribing; use std::borrow::Cow; @@ -165,7 +166,8 @@ impl<'a> FragmentCreateBuilder<'a> { writer.write_batches(batch_chunk.iter()).await?; } - fragment.physical_rows = Some(writer.finish().await? 
as usize); + let write_summary = writer.finish().await?; + fragment.physical_rows = Some(write_summary.num_rows as usize); if matches!(fragment.physical_rows, Some(0)) { return Err(Error::invalid_input("Input data was empty.")); @@ -186,6 +188,7 @@ impl<'a> FragmentCreateBuilder<'a> { fragment.files[0].fields = field_ids; fragment.files[0].column_indices = column_indices; + fragment.files[0].file_size_bytes = CachedFileSize::new(write_summary.size_bytes); progress.complete(&fragment).await?; diff --git a/rust/lance/src/dataset/write.rs b/rust/lance/src/dataset/write.rs index 8be2753cb96..1e73618fc6b 100644 --- a/rust/lance/src/dataset/write.rs +++ b/rust/lance/src/dataset/write.rs @@ -1051,9 +1051,10 @@ where Ok(self.writer.tell().await? as u64) } async fn finish(&mut self) -> Result<(u32, DataFile)> { + let num_rows = self.writer.finish().await? as u32; let size_bytes = self.writer.tell().await?; Ok(( - self.writer.finish().await? as u32, + num_rows, DataFile::new_legacy( self.path.clone(), self.writer.schema(), @@ -1106,17 +1107,17 @@ impl GenericWriter for V2WriterAdapter { .map(|(_, column_index)| *column_index as i32) .collect::>(); let (major, minor) = self.writer.version().to_numbers(); - let num_rows = self.writer.finish().await? 
as u32; + let write_summary = self.writer.finish().await?; let data_file = DataFile::new( std::mem::take(&mut self.path), field_ids, column_indices, major, minor, - NonZero::new(self.writer.tell().await?), + NonZero::new(write_summary.size_bytes), self.base_id, ); - Ok((num_rows, data_file)) + Ok((write_summary.num_rows as u32, data_file)) } } From de176bd6175f5fea0fdf016a0554ee0fb819d50c Mon Sep 17 00:00:00 2001 From: Beinan Date: Thu, 4 Jun 2026 11:42:33 -0700 Subject: [PATCH 020/177] feat(index): implement FM-Index scalar index for exact substring search (#7026) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Motivation Enable exact substring search at scale for AI pretraining data decontamination — detecting benchmark contamination in trillion-row text corpora, following the [Infini-gram Mini paper](https://arxiv.org/abs/2506.12229). ## Summary - Implement FM-Index following the Infini-gram Mini paper architecture for exact substring search - Huffman-shaped wavelet tree for entropy-compressed BWT rank queries (~0.26N bytes) - Sampled suffix array (D=32) with LF-mapping locate for document resolution (~0.25N bytes) - Partitioned index (10K docs/partition) with blocked storage (32KB blocks) and lazy loading - Wire up `IndexType::FMIndex` in Lance's `create_index` and query paths (`contains()` filter) - Index size ~0.95x of text (paper claims 0.44x; gap is Lance row overhead per block) ## Benchmark (100K gitlake source code files, 1.59 GB text) | Metric | FM-Index | N-Gram | |--------|----------|--------| | Index size | 1,513 MB (0.95x) | 84 MB (0.05x) | | Build time | 132s | 9s | | Short queries (e.g. `fn `) | 9034ms/q | 448ms/q | | Medium queries (e.g. `fn main()`) | **29ms/q** | 480ms/q | | Long queries (~80 chars) | **34ms/q** | 206ms/q | FM-Index is 17x faster than N-Gram on medium queries and returns exact results (N-Gram returns approximate candidates needing recheck). 
N-Gram cannot find queries shorter than 3 characters (e.g. `fn ` returns 0). ## Test plan - [x] 9 unit tests covering search, locate, wavelet access, serialization, multi-document - [x] End-to-end benchmark through Lance dataset API (`dataset.create_index`, `dataset.count_rows(filter)`) - [x] Verified correct match counts against full-scan baseline on real source code 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Beinan Wang --- Cargo.lock | 10 + java/lance-jni/src/blocking_dataset.rs | 1 + protos/index.proto | 4 +- python/Cargo.lock | 10 + rust/lance-index/Cargo.toml | 1 + rust/lance-index/src/lib.rs | 13 +- rust/lance-index/src/registry.rs | 6 +- rust/lance-index/src/scalar.rs | 4 + rust/lance-index/src/scalar/fmindex.rs | 2157 ++++++++++++++++++++++++ rust/lance/src/index.rs | 1 + rust/lance/src/index/create.rs | 1 + 11 files changed, 2204 insertions(+), 4 deletions(-) create mode 100644 rust/lance-index/src/scalar/fmindex.rs diff --git a/Cargo.lock b/Cargo.lock index 06aa1e2c5d9..a03cd1f05bf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4857,6 +4857,7 @@ dependencies = [ "lance-testing", "lance-tokenizer", "libm", + "libsais-rs", "log", "ndarray", "num-traits", @@ -5241,6 +5242,15 @@ dependencies = [ "libc", ] +[[package]] +name = "libsais-rs" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40fe164dbd47ea0c20e78a121c980ef673326905f1d4fba55e3645a20ef6717f" +dependencies = [ + "rayon", +] + [[package]] name = "lindera" version = "3.0.7" diff --git a/java/lance-jni/src/blocking_dataset.rs b/java/lance-jni/src/blocking_dataset.rs index cd43c69d61a..935de3e8a35 100644 --- a/java/lance-jni/src/blocking_dataset.rs +++ b/java/lance-jni/src/blocking_dataset.rs @@ -974,6 +974,7 @@ fn inner_create_index<'local>( | IndexType::NGram | IndexType::ZoneMap | IndexType::BloomFilter + | IndexType::FMIndex | IndexType::RTree => { // For scalar indices, create a scalar IndexParams let 
(index_type_str, params_opt) = get_scalar_index_params(env, params_jobj)?; diff --git a/protos/index.proto b/protos/index.proto index ea21c70387d..b1045f8977c 100644 --- a/protos/index.proto +++ b/protos/index.proto @@ -246,4 +246,6 @@ message JsonIndexDetails { } message BloomFilterIndexDetails {} -message RTreeIndexDetails {} \ No newline at end of file +message RTreeIndexDetails {} + +message FMIndexIndexDetails {} \ No newline at end of file diff --git a/python/Cargo.lock b/python/Cargo.lock index 589346aff04..de95b2730dc 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -4424,6 +4424,7 @@ dependencies = [ "lance-table", "lance-tokenizer", "libm", + "libsais-rs", "log", "ndarray", "num-traits", @@ -4757,6 +4758,15 @@ dependencies = [ "libc", ] +[[package]] +name = "libsais-rs" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40fe164dbd47ea0c20e78a121c980ef673326905f1d4fba55e3645a20ef6717f" +dependencies = [ + "rayon", +] + [[package]] name = "lindera" version = "3.0.7" diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml index 5ff94574a70..07d74760b75 100644 --- a/rust/lance-index/Cargo.toml +++ b/rust/lance-index/Cargo.toml @@ -45,6 +45,7 @@ lance-encoding.workspace = true lance-file.workspace = true lance-geo = { workspace = true, optional = true } lance-io.workspace = true +libsais-rs = "0.2" lance-linalg.workspace = true lance-select.workspace = true lance-tokenizer.workspace = true diff --git a/rust/lance-index/src/lib.rs b/rust/lance-index/src/lib.rs index 0ed6ddd4e2d..fee421bab08 100644 --- a/rust/lance-index/src/lib.rs +++ b/rust/lance-index/src/lib.rs @@ -125,6 +125,8 @@ pub enum IndexType { RTree = 10, // RTree + FMIndex = 11, // FM-Index + // 100+ and up for vector index. /// Flat vector index. 
Vector = 100, // Legacy vector index, alias to IvfPq @@ -150,6 +152,7 @@ impl std::fmt::Display for IndexType { Self::ZoneMap => write!(f, "ZoneMap"), Self::BloomFilter => write!(f, "BloomFilter"), Self::RTree => write!(f, "RTree"), + Self::FMIndex => write!(f, "FMIndex"), Self::Vector | Self::IvfPq => write!(f, "IVF_PQ"), Self::IvfFlat => write!(f, "IVF_FLAT"), Self::IvfSq => write!(f, "IVF_SQ"), @@ -177,6 +180,7 @@ impl TryFrom for IndexType { v if v == Self::ZoneMap as i32 => Ok(Self::ZoneMap), v if v == Self::BloomFilter as i32 => Ok(Self::BloomFilter), v if v == Self::RTree as i32 => Ok(Self::RTree), + v if v == Self::FMIndex as i32 => Ok(Self::FMIndex), v if v == Self::Vector as i32 => Ok(Self::Vector), v if v == Self::IvfFlat as i32 => Ok(Self::IvfFlat), v if v == Self::IvfSq as i32 => Ok(Self::IvfSq), @@ -205,6 +209,7 @@ impl TryFrom<&str> for IndexType { "ZoneMap" | "ZONEMAP" => Ok(Self::ZoneMap), "BloomFilter" | "BLOOMFILTER" | "BLOOM_FILTER" => Ok(Self::BloomFilter), "RTree" | "RTREE" | "R_TREE" => Ok(Self::RTree), + "FMIndex" | "FMINDEX" | "FM_INDEX" => Ok(Self::FMIndex), "Vector" | "VECTOR" => Ok(Self::Vector), "IVF_FLAT" => Ok(Self::IvfFlat), "IVF_SQ" => Ok(Self::IvfSq), @@ -235,7 +240,8 @@ impl IndexType { | Self::NGram | Self::ZoneMap | Self::BloomFilter - | Self::RTree, + | Self::RTree + | Self::FMIndex, ) } @@ -275,6 +281,7 @@ impl IndexType { Self::ZoneMap => 0, Self::BloomFilter => 0, Self::RTree => 0, + Self::FMIndex => 0, // IMPORTANT: if any vector index subtype needs a format bump that is // not backward compatible, its new version must be set to @@ -389,6 +396,7 @@ mod tests { IndexType::ZoneMap, IndexType::BloomFilter, IndexType::RTree, + IndexType::FMIndex, IndexType::Vector, IndexType::IvfFlat, IndexType::IvfSq, @@ -430,6 +438,9 @@ mod tests { ("RTree", IndexType::RTree), ("RTREE", IndexType::RTree), ("R_TREE", IndexType::RTree), + ("FMIndex", IndexType::FMIndex), + ("FMINDEX", IndexType::FMIndex), + ("FM_INDEX", IndexType::FMIndex), 
("Vector", IndexType::Vector), ("VECTOR", IndexType::Vector), ("IVF_FLAT", IndexType::IvfFlat), diff --git a/rust/lance-index/src/registry.rs b/rust/lance-index/src/registry.rs index 8ab65d38896..1608baec8e6 100644 --- a/rust/lance-index/src/registry.rs +++ b/rust/lance-index/src/registry.rs @@ -10,8 +10,9 @@ use crate::{ pb, pbold, scalar::{ bitmap::BitmapIndexPlugin, bloomfilter::BloomFilterIndexPlugin, btree::BTreeIndexPlugin, - inverted::InvertedIndexPlugin, json::JsonIndexPlugin, label_list::LabelListIndexPlugin, - ngram::NGramIndexPlugin, registry::ScalarIndexPlugin, zonemap::ZoneMapIndexPlugin, + fmindex::FMIndexPlugin, inverted::InvertedIndexPlugin, json::JsonIndexPlugin, + label_list::LabelListIndexPlugin, ngram::NGramIndexPlugin, registry::ScalarIndexPlugin, + zonemap::ZoneMapIndexPlugin, }, }; @@ -66,6 +67,7 @@ impl IndexPluginRegistry { registry.add_plugin::(); registry.add_plugin::(); registry.add_plugin::(); + registry.add_plugin::(); #[cfg(feature = "geo")] registry.add_plugin::(); diff --git a/rust/lance-index/src/scalar.rs b/rust/lance-index/src/scalar.rs index ac2efc26870..52a64d7d6c9 100644 --- a/rust/lance-index/src/scalar.rs +++ b/rust/lance-index/src/scalar.rs @@ -35,6 +35,7 @@ pub mod bitmap; pub mod bloomfilter; pub mod btree; pub mod expression; +pub mod fmindex; pub mod inverted; pub mod json; pub mod label_list; @@ -74,6 +75,7 @@ pub enum BuiltinIndexType { BloomFilter, RTree, Inverted, + FMIndex, } impl BuiltinIndexType { @@ -87,6 +89,7 @@ impl BuiltinIndexType { Self::Inverted => "inverted", Self::BloomFilter => "bloomfilter", Self::RTree => "rtree", + Self::FMIndex => "fmindex", } } } @@ -104,6 +107,7 @@ impl TryFrom for BuiltinIndexType { IndexType::Inverted => Ok(Self::Inverted), IndexType::BloomFilter => Ok(Self::BloomFilter), IndexType::RTree => Ok(Self::RTree), + IndexType::FMIndex => Ok(Self::FMIndex), _ => Err(Error::index("Invalid index type".to_string())), } } diff --git a/rust/lance-index/src/scalar/fmindex.rs 
b/rust/lance-index/src/scalar/fmindex.rs new file mode 100644 index 00000000000..331be04c538 --- /dev/null +++ b/rust/lance-index/src/scalar/fmindex.rs @@ -0,0 +1,2157 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! FM-Index for exact substring search (following the Infini-gram Mini paper) +//! +//! The FM-Index is a compressed full-text index based on the Burrows-Wheeler Transform (BWT). +//! It supports exact substring matching via backward search and returns exact row ids. +//! +//! Architecture (matching the paper): +//! - Huffman-shaped Wavelet Tree over BWT for entropy-compressed rank queries (~0.26N) +//! - Sampled Suffix Array every D-th position for locate (~N/D × 8 bytes) +//! - doc_start_positions for mapping text positions to documents (tiny) +//! - No doc_array — documents are resolved via SA sampling + LF-mapping + binary search +//! +//! Total index size: ~0.44N (matching paper's claim) +//! +//! Storage layout (v10 - blocked, partitioned): +//! - BWT wavelet tree bitvectors in blocks of BLOCK_WORDS (32KB each) +//! - SA samples stored as packed binary blocks after wavelet blocks +//! - Row IDs and doc_start_positions in metadata +//! 
- File metadata: c_table, huffman_codes, tree topology + +use std::cmp::Reverse; +use std::collections::{BinaryHeap, HashMap}; +use std::sync::{Arc, OnceLock}; + +use arrow_array::RecordBatch; +use arrow_schema::{DataType, Field}; +use async_trait::async_trait; +use datafusion::execution::SendableRecordBatchStream; +use deepsize::DeepSizeOf; +use futures::StreamExt; +use lance_core::cache::LanceCache; +use lance_core::{Error, Result}; +use roaring::RoaringBitmap; + +use crate::frag_reuse::FragReuseIndex; +use crate::metrics::MetricsCollector; +use crate::pb; +use crate::scalar::expression::{ScalarQueryParser, TextQueryParser}; +use crate::scalar::registry::{ + DefaultTrainingRequest, ScalarIndexPlugin, TrainingCriteria, TrainingOrdering, TrainingRequest, +}; +use crate::scalar::{ + AnyQuery, BuiltinIndexType, CreatedIndex, IndexStore, OldIndexDataFilter, ScalarIndex, + ScalarIndexParams, SearchResult, TextQuery, UpdateCriteria, +}; +use crate::vector::VectorIndex; +use crate::{Index, IndexType}; + +const FMINDEX_INDEX_VERSION: u32 = 10; +const BLOCK_WORDS: usize = 4096; +const PARTITION_SIZE: usize = 10_000; +const SENTINEL_BYTE: u8 = 0xFF; + +/// SA sampling rate. Store every D-th SA entry. Locate walks at most D LF steps. 
+const SA_SAMPLE_RATE: usize = 32; + +fn fmindex_partition_path(partition_id: u64) -> String { + format!("part_{partition_id}_fmindex.lance") +} + +// ── Bitvector with O(1) rank ───────────────────────────────────────────────── + +const SUPERBLOCK_BITS: usize = 512; +const WORDS_PER_SUPERBLOCK: usize = SUPERBLOCK_BITS / 64; + +#[derive(Debug, Clone)] +struct RankBitVec { + words: Vec, + superblocks: Vec, + len: usize, +} + +#[allow(dead_code)] +impl RankBitVec { + fn new(len: usize) -> Self { + Self { + words: vec![0u64; len.div_ceil(64)], + superblocks: Vec::new(), + len, + } + } + + #[inline] + fn set(&mut self, pos: usize) { + self.words[pos / 64] |= 1u64 << (pos % 64); + } + + #[inline] + fn get(&self, pos: usize) -> bool { + (self.words[pos / 64] >> (pos % 64)) & 1 != 0 + } + + fn build_rank_index(&mut self) { + let num_sb = self.words.len().div_ceil(WORDS_PER_SUPERBLOCK) + 1; + self.superblocks = Vec::with_capacity(num_sb); + let mut cum = 0u32; + for (i, chunk) in self.words.chunks(WORDS_PER_SUPERBLOCK).enumerate() { + self.superblocks.push(if i == 0 { 0 } else { cum }); + for &w in chunk { + cum += w.count_ones(); + } + } + self.superblocks.push(cum); + } + + #[inline] + fn rank1(&self, pos: usize) -> usize { + if pos == 0 { + return 0; + } + let word_idx = pos / 64; + let bit_idx = pos % 64; + let sb_idx = word_idx / WORDS_PER_SUPERBLOCK; + let mut count = self.superblocks[sb_idx] as usize; + for i in (sb_idx * WORDS_PER_SUPERBLOCK)..word_idx { + count += self.words[i].count_ones() as usize; + } + if bit_idx > 0 { + count += (self.words[word_idx] & ((1u64 << bit_idx) - 1)).count_ones() as usize; + } + count + } + + #[inline] + fn rank0(&self, pos: usize) -> usize { + pos - self.rank1(pos) + } + + fn deep_size(&self) -> usize { + self.words.len() * 8 + self.superblocks.len() * 4 + } +} + +// ── Huffman-shaped Wavelet Tree ────────────────────────────────────────────── + +#[derive(Debug, Clone, Default)] +struct HuffmanCode { + bits: u32, + length: u8, + 
node_path: Vec, +} + +#[derive(Debug, Clone)] +enum WaveletChild { + Node(usize), + Leaf(u8), +} + +#[derive(Debug, Clone)] +struct HuffmanWaveletTree { + nodes: Vec, + codes: [HuffmanCode; 256], + children: Vec<(WaveletChild, WaveletChild)>, + len: usize, +} + +#[derive(Debug)] +enum HuffNode { + Leaf(u8), + Internal { left: Box, right: Box }, +} + +impl PartialEq for HuffNode { + fn eq(&self, _: &Self) -> bool { + true + } +} +impl Eq for HuffNode {} +impl PartialOrd for HuffNode { + fn partial_cmp(&self, o: &Self) -> Option { + Some(self.cmp(o)) + } +} +impl Ord for HuffNode { + fn cmp(&self, _: &Self) -> std::cmp::Ordering { + std::cmp::Ordering::Equal + } +} + +#[allow(dead_code)] +impl HuffmanWaveletTree { + fn build(data: &[u8]) -> Self { + let n = data.len(); + if n == 0 { + return Self { + nodes: Vec::new(), + codes: std::array::from_fn(|_| HuffmanCode::default()), + children: Vec::new(), + len: 0, + }; + } + + let mut freq = [0u64; 256]; + for &b in data { + freq[b as usize] += 1; + } + + let mut heap: BinaryHeap<(Reverse, Reverse, Box)> = BinaryHeap::new(); + let mut tie = 0; + for (v, &f) in freq.iter().enumerate() { + if f > 0 { + heap.push((Reverse(f), Reverse(tie), Box::new(HuffNode::Leaf(v as u8)))); + tie += 1; + } + } + if heap.len() == 1 { + let (f, _, node) = heap.pop().unwrap(); + heap.push((Reverse(0), Reverse(tie), Box::new(HuffNode::Leaf(255)))); + tie += 1; + heap.push((f, Reverse(tie), node)); + tie += 1; + } + while heap.len() > 1 { + let (Reverse(f1), _, l) = heap.pop().unwrap(); + let (Reverse(f2), _, r) = heap.pop().unwrap(); + heap.push(( + Reverse(f1 + f2), + Reverse(tie), + Box::new(HuffNode::Internal { left: l, right: r }), + )); + tie += 1; + } + let root = heap.pop().unwrap().2; + + let mut codes: [HuffmanCode; 256] = std::array::from_fn(|_| HuffmanCode::default()); + let mut node_count = 0; + let mut children_map: Vec<(WaveletChild, WaveletChild)> = Vec::new(); + + fn assign( + node: &HuffNode, + bits: u32, + len: u8, + path: 
&mut Vec, + nid: &mut usize, + codes: &mut [HuffmanCode; 256], + cm: &mut Vec<(WaveletChild, WaveletChild)>, + ) -> WaveletChild { + match node { + HuffNode::Leaf(b) => { + codes[*b as usize] = HuffmanCode { + bits, + length: len, + node_path: path.clone(), + }; + WaveletChild::Leaf(*b) + } + HuffNode::Internal { left, right } => { + let my = *nid; + *nid += 1; + path.push(my); + cm.push((WaveletChild::Leaf(0), WaveletChild::Leaf(0))); + let lc = assign(left, bits << 1, len + 1, path, nid, codes, cm); + let rc = assign(right, (bits << 1) | 1, len + 1, path, nid, codes, cm); + cm[my] = (lc, rc); + path.pop(); + WaveletChild::Node(my) + } + } + } + assign( + &root, + 0, + 0, + &mut Vec::new(), + &mut node_count, + &mut codes, + &mut children_map, + ); + + let mut node_sizes = vec![0usize; node_count]; + for &b in data { + for &nid in &codes[b as usize].node_path { + node_sizes[nid] += 1; + } + } + let mut nodes: Vec = node_sizes.iter().map(|&sz| RankBitVec::new(sz)).collect(); + let mut cursors = vec![0usize; node_count]; + for &b in data { + let code = &codes[b as usize]; + for (level, &nid) in code.node_path.iter().enumerate() { + if (code.bits >> (code.length - 1 - level as u8)) & 1 == 1 { + nodes[nid].set(cursors[nid]); + } + cursors[nid] += 1; + } + } + for n in &mut nodes { + n.build_rank_index(); + } + Self { + nodes, + codes, + children: children_map, + len: n, + } + } + + /// Retrieve the byte at position `pos` in the original BWT. 
+ #[inline] + fn access(&self, mut pos: usize) -> u8 { + if self.nodes.is_empty() { + return 0; + } + let mut node_idx = 0; + loop { + let bit = self.nodes[node_idx].get(pos); + let (ref left, ref right) = self.children[node_idx]; + if bit { + pos = self.nodes[node_idx].rank1(pos); + match right { + WaveletChild::Leaf(b) => return *b, + WaveletChild::Node(next) => node_idx = *next, + } + } else { + pos = self.nodes[node_idx].rank0(pos); + match left { + WaveletChild::Leaf(b) => return *b, + WaveletChild::Node(next) => node_idx = *next, + } + } + } + } + + /// Count occurrences of byte `c` in positions `[0, pos)`. + #[inline] + fn rank(&self, c: u8, pos: usize) -> usize { + let code = &self.codes[c as usize]; + if code.length == 0 { + return 0; + } + let (mut lo, mut hi) = (0, pos); + for (level, &nid) in code.node_path.iter().enumerate() { + if (code.bits >> (code.length - 1 - level as u8)) & 1 == 0 { + lo = self.nodes[nid].rank0(lo); + hi = self.nodes[nid].rank0(hi); + } else { + lo = self.nodes[nid].rank1(lo); + hi = self.nodes[nid].rank1(hi); + } + } + hi - lo + } + + #[inline] + fn rank_pair(&self, c: u8, lo: usize, hi: usize) -> (usize, usize) { + let code = &self.codes[c as usize]; + if code.length == 0 { + return (0, 0); + } + let (mut s, mut l, mut h) = (0, lo, hi); + for (level, &nid) in code.node_path.iter().enumerate() { + if (code.bits >> (code.length - 1 - level as u8)) & 1 == 0 { + s = self.nodes[nid].rank0(s); + l = self.nodes[nid].rank0(l); + h = self.nodes[nid].rank0(h); + } else { + s = self.nodes[nid].rank1(s); + l = self.nodes[nid].rank1(l); + h = self.nodes[nid].rank1(h); + } + } + (l - s, h - s) + } + + fn deep_size(&self) -> usize { + self.nodes.iter().map(|n| n.deep_size()).sum::() + + self + .codes + .iter() + .map(|c| c.node_path.len() * 8) + .sum::() + + self.children.len() * 24 + } +} + +// ── Suffix Array ───────────────────────────────────────────────────────────── + +fn build_suffix_array(text: &[u8]) -> Vec { + let n = text.len(); + 
if n == 0 { + return Vec::new(); + } + if n > i32::MAX as usize { + let mut sa = vec![0i64; n]; + assert_eq!(libsais_rs::libsais64(text, &mut sa, 0, None), 0); + sa.iter().map(|&x| x as usize).collect() + } else { + let mut sa = vec![0i32; n]; + assert_eq!(libsais_rs::libsais(text, &mut sa, 0, None), 0); + sa.iter().map(|&x| x as usize).collect() + } +} + +// ── Lazy Block Loading ─────────────────────────────────────────────────────── + +const BLOCK_BITS: usize = BLOCK_WORDS * 64; + +struct LazyRankBitVec { + prefix_ranks: Vec, + blocks: Vec>>, + reader: Arc, + block_row_offset: usize, + len: usize, +} + +impl std::fmt::Debug for LazyRankBitVec { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("LazyRankBitVec") + .field("len", &self.len) + .finish() + } +} + +impl LazyRankBitVec { + fn new( + prefix_ranks: Vec, + num_blocks: usize, + reader: Arc, + offset: usize, + len: usize, + ) -> Self { + Self { + prefix_ranks, + blocks: (0..num_blocks).map(|_| OnceLock::new()).collect(), + reader, + block_row_offset: offset, + len, + } + } + + /// Pre-load all blocks into memory. Call this before sync rank/access operations + /// to avoid the need for `block_in_place` during queries. 
+ async fn load_all_blocks(&self) -> Result<()> { + for (idx, lock) in self.blocks.iter().enumerate() { + if lock.get().is_none() { + let words = self.load_block(idx).await?; + let _ = lock.set(words); + } + } + Ok(()) + } + + #[inline] + fn ensure_block(&self, idx: usize) -> &[u64] { + self.blocks[idx].get_or_init(|| { + tokio::task::block_in_place(|| { + tokio::runtime::Handle::current().block_on(self.load_block(idx)) + }) + .unwrap_or_else(|e| panic!("FM-Index block load failed: {e}")) + }) + } + + async fn load_block(&self, idx: usize) -> Result> { + let row = self.block_row_offset + idx; + let batch = self + .reader + .read_range(row..row + 1, Some(&["words"])) + .await?; + let col = batch + .column(0) + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::invalid_input("expected LargeBinary words column"))?; + Ok(col + .value(0) + .chunks_exact(8) + .map(|c| u64::from_le_bytes(c.try_into().unwrap())) + .collect()) + } + + #[inline] + fn rank1(&self, pos: usize) -> usize { + if pos == 0 { + return 0; + } + let bi = pos / BLOCK_BITS; + let local = pos % BLOCK_BITS; + if local == 0 { + return self.prefix_ranks[bi] as usize; + } + let mut count = self.prefix_ranks[bi] as usize; + let block = self.ensure_block(bi); + let wi = local / 64; + let bit = local % 64; + for w in &block[..wi] { + count += w.count_ones() as usize; + } + if bit > 0 { + count += (block[wi] & ((1u64 << bit) - 1)).count_ones() as usize; + } + count + } + + #[inline] + fn rank0(&self, pos: usize) -> usize { + pos - self.rank1(pos) + } + + #[inline] + fn get(&self, pos: usize) -> bool { + let bi = pos / BLOCK_BITS; + let local = pos % BLOCK_BITS; + let block = self.ensure_block(bi); + (block[local / 64] >> (local % 64)) & 1 != 0 + } + + fn deep_size(&self) -> usize { + let loaded: usize = self + .blocks + .iter() + .filter_map(|b| b.get()) + .map(|w| w.len() * 8) + .sum(); + self.prefix_ranks.len() * 8 + loaded + } +} + +struct LazyHuffmanWaveletTree { + nodes: Vec, + codes: [HuffmanCode; 256], 
+ children: Vec<(WaveletChild, WaveletChild)>, + len: usize, +} + +impl std::fmt::Debug for LazyHuffmanWaveletTree { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("LazyHuffmanWaveletTree") + .field("len", &self.len) + .finish() + } +} + +impl LazyHuffmanWaveletTree { + /// Pre-load all wavelet tree blocks into memory. + async fn load_all(&self) -> Result<()> { + for node in &self.nodes { + node.load_all_blocks().await?; + } + Ok(()) + } + + #[inline] + fn access(&self, mut pos: usize) -> u8 { + if self.nodes.is_empty() { + return 0; + } + let mut node_idx = 0; + loop { + let bit = self.nodes[node_idx].get(pos); + let (ref left, ref right) = self.children[node_idx]; + if bit { + pos = self.nodes[node_idx].rank1(pos); + match right { + WaveletChild::Leaf(b) => return *b, + WaveletChild::Node(next) => node_idx = *next, + } + } else { + pos = self.nodes[node_idx].rank0(pos); + match left { + WaveletChild::Leaf(b) => return *b, + WaveletChild::Node(next) => node_idx = *next, + } + } + } + } + + #[inline] + fn rank(&self, c: u8, pos: usize) -> usize { + let code = &self.codes[c as usize]; + if code.length == 0 { + return 0; + } + let (mut lo, mut hi) = (0, pos); + for (level, &nid) in code.node_path.iter().enumerate() { + if (code.bits >> (code.length - 1 - level as u8)) & 1 == 0 { + lo = self.nodes[nid].rank0(lo); + hi = self.nodes[nid].rank0(hi); + } else { + lo = self.nodes[nid].rank1(lo); + hi = self.nodes[nid].rank1(hi); + } + } + hi - lo + } + + #[inline] + fn rank_pair(&self, c: u8, lo: usize, hi: usize) -> (usize, usize) { + let code = &self.codes[c as usize]; + if code.length == 0 { + return (0, 0); + } + let (mut s, mut l, mut h) = (0, lo, hi); + for (level, &nid) in code.node_path.iter().enumerate() { + if (code.bits >> (code.length - 1 - level as u8)) & 1 == 0 { + s = self.nodes[nid].rank0(s); + l = self.nodes[nid].rank0(l); + h = self.nodes[nid].rank0(h); + } else { + s = self.nodes[nid].rank1(s); + l = 
self.nodes[nid].rank1(l); + h = self.nodes[nid].rank1(h); + } + } + (l - s, h - s) + } + + fn deep_size(&self) -> usize { + self.nodes.iter().map(|n| n.deep_size()).sum::() + + self + .codes + .iter() + .map(|c| c.node_path.len() * 8) + .sum::() + } +} + +// ── FM-Index (in-memory, build-time) ───────────────────────────────────────── + +#[derive(Debug, Clone)] +pub struct FMIndex { + wavelet: HuffmanWaveletTree, + row_ids: Vec, + /// Sampled SA: sa_samples[i] = SA[i * SA_SAMPLE_RATE]. Size: N/D × 8 bytes. + sa_samples: Vec, + /// Starting byte offset of each document in the concatenated text. + doc_start_positions: Vec, + c_table: Vec, + alphabet_size: usize, +} + +impl DeepSizeOf for FMIndex { + fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + self.wavelet.deep_size() + + self.row_ids.len() * 8 + + self.sa_samples.len() * 8 + + self.doc_start_positions.len() * 8 + + self.c_table.len() * std::mem::size_of::() + } +} + +#[allow(dead_code)] +impl FMIndex { + fn build(texts: &[(u64, &[u8])]) -> Result { + if texts.is_empty() { + return Ok(Self { + wavelet: HuffmanWaveletTree { + nodes: Vec::new(), + codes: std::array::from_fn(|_| HuffmanCode::default()), + children: Vec::new(), + len: 0, + }, + row_ids: Vec::new(), + sa_samples: Vec::new(), + doc_start_positions: Vec::new(), + c_table: vec![0; 257], + alphabet_size: 256, + }); + } + + let mut concat = Vec::new(); + let mut doc_row_ids = Vec::new(); + let mut doc_starts: Vec = Vec::new(); + for (row_id, text) in texts { + doc_starts.push(concat.len() as u64); + doc_row_ids.push(*row_id); + concat.extend_from_slice(text); + concat.push(SENTINEL_BYTE); // \xFF separator between documents + } + // Append unique terminator \x00 so SA-IS produces a proper suffix array + // with a single-cycle LF-mapping permutation. 
+ concat.push(0x00); + let n = concat.len(); + let sa = build_suffix_array(&concat); + + let bwt: Vec = sa + .iter() + .map(|&pos| { + if pos == 0 { + concat[n - 1] + } else { + concat[pos - 1] + } + }) + .collect(); + + let mut counts = vec![0usize; 257]; + for &b in &concat { + counts[b as usize + 1] += 1; + } + for i in 1..257 { + counts[i] += counts[i - 1]; + } + + // Sampled SA: store every D-th entry + let sa_samples: Vec = sa + .iter() + .step_by(SA_SAMPLE_RATE) + .map(|&pos| pos as u64) + .collect(); + + let wavelet = HuffmanWaveletTree::build(&bwt); + + Ok(Self { + wavelet, + row_ids: doc_row_ids, + sa_samples, + doc_start_positions: doc_starts, + c_table: counts, + alphabet_size: 256, + }) + } + + /// Locate: resolve SA[pos] by walking LF-mapping until hitting a sampled position. + /// For large data (N >> SA_SAMPLE_RATE), converges within SA_SAMPLE_RATE steps. + /// For small data with short LF cycles, may need up to N steps. + #[inline] + fn locate(&self, mut pos: usize) -> usize { + let mut steps = 0; + let n = self.wavelet.len; + loop { + if pos.is_multiple_of(SA_SAMPLE_RATE) && (pos / SA_SAMPLE_RATE) < self.sa_samples.len() + { + return (self.sa_samples[pos / SA_SAMPLE_RATE] as usize + steps) % n; + } + let c = self.wavelet.access(pos); + pos = self.c_table[c as usize] + self.wavelet.rank(c, pos); + steps += 1; + if steps >= n { + log::warn!("FM-Index SA locate exceeded {n} steps, possible index corruption"); + return 0; + } + } + } + + /// Map a text position to document index via binary search on doc_start_positions. 
+ #[inline] + fn doc_for_position(&self, text_pos: usize) -> usize { + let tp = text_pos as u64; + match self.doc_start_positions.binary_search(&tp) { + Ok(idx) => idx, + Err(idx) => idx - 1, + } + } + + fn backward_search(&self, pattern: &[u8]) -> (usize, usize) { + if pattern.is_empty() || self.wavelet.len == 0 { + return (0, 0); + } + let (mut lo, mut hi) = (0, self.wavelet.len); + for &b in pattern.iter().rev() { + let c = self.c_table[b as usize]; + let (occ_lo, occ_hi) = self.wavelet.rank_pair(b, lo, hi); + lo = c + occ_lo; + hi = c + occ_hi; + if lo >= hi { + return (0, 0); + } + } + (lo, hi) + } + + fn search(&self, pattern: &[u8]) -> RoaringBitmap { + let (lo, hi) = self.backward_search(pattern); + if lo >= hi { + return RoaringBitmap::new(); + } + let mut result = RoaringBitmap::new(); + for i in lo..hi { + let text_pos = self.locate(i); + let doc_idx = self.doc_for_position(text_pos); + result.insert(self.row_ids[doc_idx] as u32); + } + result + } + + fn serialize_huffman_codes(&self) -> Vec { + let mut buf = Vec::new(); + for code in &self.wavelet.codes { + buf.extend_from_slice(&code.bits.to_le_bytes()); + buf.push(code.length); + buf.extend_from_slice(&(code.node_path.len() as u16).to_le_bytes()); + for &nid in &code.node_path { + buf.extend_from_slice(&(nid as u32).to_le_bytes()); + } + } + buf + } + + fn deserialize_huffman_codes(data: &[u8]) -> [HuffmanCode; 256] { + let mut codes: [HuffmanCode; 256] = std::array::from_fn(|_| HuffmanCode::default()); + let mut cur = 0; + for code in &mut codes { + let bits = u32::from_le_bytes(data[cur..cur + 4].try_into().unwrap()); + cur += 4; + let length = data[cur]; + cur += 1; + let plen = u16::from_le_bytes(data[cur..cur + 2].try_into().unwrap()) as usize; + cur += 2; + let mut node_path = Vec::with_capacity(plen); + for _ in 0..plen { + node_path.push(u32::from_le_bytes(data[cur..cur + 4].try_into().unwrap()) as usize); + cur += 4; + } + *code = HuffmanCode { + bits, + length, + node_path, + }; + } + codes 
+ } + + fn serialize_tree_topology(&self) -> Vec { + let mut buf = Vec::new(); + buf.extend_from_slice(&(self.wavelet.children.len() as u32).to_le_bytes()); + for (left, right) in &self.wavelet.children { + for child in [left, right] { + match child { + WaveletChild::Node(id) => { + buf.push(0); + buf.extend_from_slice(&(*id as u32).to_le_bytes()); + } + WaveletChild::Leaf(b) => { + buf.push(1); + buf.extend_from_slice(&(*b as u32).to_le_bytes()); + } + } + } + } + buf + } + + fn deserialize_tree_topology(data: &[u8]) -> Vec<(WaveletChild, WaveletChild)> { + let mut cur = 0; + let count = u32::from_le_bytes(data[cur..cur + 4].try_into().unwrap()) as usize; + cur += 4; + let mut children = Vec::with_capacity(count); + for _ in 0..count { + let mut read_child = || { + let t = data[cur]; + cur += 1; + let v = u32::from_le_bytes(data[cur..cur + 4].try_into().unwrap()); + cur += 4; + if t == 0 { + WaveletChild::Node(v as usize) + } else { + WaveletChild::Leaf(v as u8) + } + }; + let l = read_child(); + let r = read_child(); + children.push((l, r)); + } + children + } + + fn serialize_c_table(&self) -> Vec { + self.c_table + .iter() + .flat_map(|&v| (v as u64).to_le_bytes()) + .collect() + } + + fn deserialize_c_table(data: &[u8]) -> Vec { + data.chunks_exact(8) + .map(|c| u64::from_le_bytes(c.try_into().unwrap()) as usize) + .collect() + } + + fn u64_to_bytes(data: &[u64]) -> Vec { + data.iter().flat_map(|v| v.to_le_bytes()).collect() + } + + fn build_wavelet_batch(&self) -> Result { + use arrow_array::{LargeBinaryArray, UInt32Array, UInt64Array}; + let mut nid_b = Vec::new(); + let mut bid_b = Vec::new(); + let mut words_b: Vec> = Vec::new(); + let mut pr_b = Vec::new(); + let mut bl_b = Vec::new(); + + for (i, node) in self.wavelet.nodes.iter().enumerate() { + let mut pr: u64 = 0; + if node.words.is_empty() { + nid_b.push(i as u32); + bid_b.push(0u32); + words_b.push(Vec::new()); + pr_b.push(0u64); + bl_b.push(node.len as u64); + } else { + for (bi, chunk) in 
node.words.chunks(BLOCK_WORDS).enumerate() { + nid_b.push(i as u32); + bid_b.push(bi as u32); + words_b.push(Self::u64_to_bytes(chunk)); + pr_b.push(pr); + bl_b.push(node.len as u64); + pr += chunk.iter().map(|w| w.count_ones() as u64).sum::(); + } + } + } + let refs: Vec<&[u8]> = words_b.iter().map(|v| v.as_slice()).collect(); + let schema = Arc::new(Self::block_schema()); + Ok(RecordBatch::try_new( + schema, + vec![ + Arc::new(UInt32Array::from(nid_b)), + Arc::new(UInt32Array::from(bid_b)), + Arc::new(LargeBinaryArray::from(refs)), + Arc::new(UInt64Array::from(pr_b)), + Arc::new(UInt64Array::from(bl_b)), + ], + )?) + } + + fn block_schema() -> arrow_schema::Schema { + arrow_schema::Schema::new(vec![ + Field::new("node_id", DataType::UInt32, false), + Field::new("block_id", DataType::UInt32, false), + Field::new("words", DataType::LargeBinary, false), + Field::new("prefix_rank", DataType::UInt64, false), + Field::new("bit_len", DataType::UInt64, false), + ]) + } +} + +// ── Lazy FM-Index ──────────────────────────────────────────────────────────── + +#[derive(Debug)] +struct LazyFMIndex { + wavelet: LazyHuffmanWaveletTree, + row_ids: Vec, + sa_samples: Vec, + doc_start_positions: Vec, + c_table: Vec, +} + +impl LazyFMIndex { + /// Pre-load all wavelet tree blocks before sync search operations. 
+ async fn prewarm(&self) -> Result<()> { + self.wavelet.load_all().await + } + + fn backward_search(&self, pattern: &[u8]) -> (usize, usize) { + if pattern.is_empty() || self.wavelet.len == 0 { + return (0, 0); + } + let (mut lo, mut hi) = (0, self.wavelet.len); + for &b in pattern.iter().rev() { + let c = self.c_table[b as usize]; + let (occ_lo, occ_hi) = self.wavelet.rank_pair(b, lo, hi); + lo = c + occ_lo; + hi = c + occ_hi; + if lo >= hi { + return (0, 0); + } + } + (lo, hi) + } + + #[inline] + fn locate(&self, mut pos: usize) -> usize { + let mut steps = 0; + let n = self.wavelet.len; + loop { + if pos.is_multiple_of(SA_SAMPLE_RATE) && (pos / SA_SAMPLE_RATE) < self.sa_samples.len() + { + return (self.sa_samples[pos / SA_SAMPLE_RATE] as usize + steps) % n; + } + let c = self.wavelet.access(pos); + pos = self.c_table[c as usize] + self.wavelet.rank(c, pos); + steps += 1; + if steps >= n { + log::warn!("FM-Index SA locate exceeded {n} steps, possible index corruption"); + return 0; + } + } + } + + #[inline] + fn doc_for_position(&self, text_pos: usize) -> usize { + let tp = text_pos as u64; + match self.doc_start_positions.binary_search(&tp) { + Ok(idx) => idx, + Err(idx) => idx - 1, + } + } + + fn search(&self, pattern: &[u8]) -> RoaringBitmap { + let (lo, hi) = self.backward_search(pattern); + if lo >= hi { + return RoaringBitmap::new(); + } + let mut result = RoaringBitmap::new(); + for i in lo..hi { + let text_pos = self.locate(i); + let doc_idx = self.doc_for_position(text_pos); + result.insert(self.row_ids[doc_idx] as u32); + } + result + } + + #[allow(clippy::too_many_arguments)] + async fn from_reader( + reader: Arc, + num_bwt_nodes: usize, + huffman_codes: [HuffmanCode; 256], + children: Vec<(WaveletChild, WaveletChild)>, + c_table: Vec, + bwt_len: usize, + total_wavelet_rows: usize, + num_sa_blocks: usize, + sa_samples_len: usize, + row_ids: Vec, + doc_start_positions: Vec, + ) -> Result { + use arrow_array::UInt64Array; + + let meta = reader + 
.read_range( + 0..total_wavelet_rows, + Some(&["node_id", "prefix_rank", "bit_len"]), + ) + .await?; + let nid_col = meta + .column_by_name("node_id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let pr_col = meta + .column_by_name("prefix_rank") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let bl_col = meta + .column_by_name("bit_len") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + struct NM { + prs: Vec, + offset: usize, + blen: usize, + } + let mut nms: Vec = (0..num_bwt_nodes) + .map(|_| NM { + prs: Vec::new(), + offset: 0, + blen: 0, + }) + .collect(); + for row in 0..meta.num_rows() { + let nid = nid_col.value(row) as usize; + if nid >= num_bwt_nodes { + continue; + } + let nm = &mut nms[nid]; + if nm.prs.is_empty() { + nm.offset = row; + } + nm.prs.push(pr_col.value(row)); + nm.blen = bl_col.value(row) as usize; + } + + let mut bwt_nodes = Vec::with_capacity(num_bwt_nodes); + for nm in &nms { + bwt_nodes.push(LazyRankBitVec::new( + nm.prs.clone(), + nm.prs.len(), + reader.clone(), + nm.offset, + nm.blen, + )); + } + let wavelet = LazyHuffmanWaveletTree { + nodes: bwt_nodes, + codes: huffman_codes, + children, + len: bwt_len, + }; + + // Read SA samples from packed binary blocks + let mut sa_samples = Vec::with_capacity(sa_samples_len); + let sa_batch = reader + .read_range( + total_wavelet_rows..total_wavelet_rows + num_sa_blocks, + Some(&["words"]), + ) + .await?; + let words_col = sa_batch + .column_by_name("words") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..sa_batch.num_rows() { + let raw = words_col.value(i); + for chunk in raw.chunks_exact(8) { + sa_samples.push(u64::from_le_bytes(chunk.try_into().unwrap())); + } + } + sa_samples.truncate(sa_samples_len); + + Ok(Self { + wavelet, + row_ids, + sa_samples, + doc_start_positions, + c_table, + }) + } + + fn deep_size(&self) -> usize { + self.wavelet.deep_size() + + self.row_ids.len() * 8 + + self.sa_samples.len() * 8 + + 
self.doc_start_positions.len() * 8 + + self.c_table.len() * std::mem::size_of::() + } +} + +// ── FMIndexScalarIndex ─────────────────────────────────────────────────────── + +#[derive(Debug)] +struct FMIndexPartition { + #[allow(dead_code)] + id: u64, + fm: LazyFMIndex, +} + +#[derive(Debug)] +pub struct FMIndexScalarIndex { + partitions: Vec>, +} + +impl DeepSizeOf for FMIndexScalarIndex { + fn deep_size_of_children(&self, _ctx: &mut deepsize::Context) -> usize { + self.partitions.iter().map(|p| p.fm.deep_size()).sum() + } +} + +impl FMIndexScalarIndex { + async fn load_partition( + store: &dyn IndexStore, + filename: &str, + pid: u64, + ) -> Result { + let reader = store.open_index_file(filename).await?; + let md = &reader.schema().metadata; + + let parse = |key: &str| -> Result { + md.get(key) + .ok_or_else(|| Error::invalid_input(format!("missing {key}")))? + .parse() + .map_err(|e| Error::invalid_input(format!("invalid {key}: {e}"))) + }; + + let num_bwt_nodes = parse("num_bwt_nodes")?; + let bwt_len = parse("bwt_len")?; + let num_sa_blocks = parse("num_sa_blocks")?; + let sa_samples_len = parse("sa_samples_len")?; + let total_wavelet_rows = parse("total_wavelet_rows")?; + + let c_table = FMIndex::deserialize_c_table(&hex_decode( + md.get("c_table") + .ok_or_else(|| Error::invalid_input("missing c_table"))?, + )?); + let huffman_codes = FMIndex::deserialize_huffman_codes(&hex_decode( + md.get("huffman_codes") + .ok_or_else(|| Error::invalid_input("missing huffman_codes"))?, + )?); + let children = FMIndex::deserialize_tree_topology(&hex_decode( + md.get("tree_topology") + .ok_or_else(|| Error::invalid_input("missing tree_topology"))?, + )?); + + // row_ids and doc_start_positions stored in metadata (small) + let row_ids_hex = md + .get("row_ids") + .ok_or_else(|| Error::invalid_input("missing row_ids"))?; + let row_ids_bytes = hex_decode(row_ids_hex)?; + let row_ids: Vec = row_ids_bytes + .chunks_exact(8) + .map(|c| u64::from_le_bytes(c.try_into().unwrap())) 
+ .collect(); + + let doc_starts_hex = md + .get("doc_start_positions") + .ok_or_else(|| Error::invalid_input("missing doc_start_positions"))?; + let doc_starts_bytes = hex_decode(doc_starts_hex)?; + let doc_start_positions: Vec = doc_starts_bytes + .chunks_exact(8) + .map(|c| u64::from_le_bytes(c.try_into().unwrap())) + .collect(); + + let fm = Box::pin(LazyFMIndex::from_reader( + reader, + num_bwt_nodes, + huffman_codes, + children, + c_table, + bwt_len, + total_wavelet_rows, + num_sa_blocks, + sa_samples_len, + row_ids, + doc_start_positions, + )) + .await?; + Ok(FMIndexPartition { id: pid, fm }) + } + + async fn load( + store: Arc, + _fri: Option>, + _cache: &LanceCache, + ) -> Result> { + let files = store.list_files_with_sizes().await?; + let mut pfiles: Vec<(u64, String)> = Vec::new(); + for f in &files { + if let Some(id) = f + .path + .strip_prefix("part_") + .and_then(|r| r.strip_suffix("_fmindex.lance")) + .and_then(|s| s.parse::().ok()) + { + pfiles.push((id, f.path.clone())); + } + } + if pfiles.is_empty() { + return Err(Error::invalid_input("no FM-Index partition files found")); + } + pfiles.sort_by_key(|(id, _)| *id); + let mut parts = Vec::with_capacity(pfiles.len()); + for (id, name) in &pfiles { + parts.push(Arc::new( + Self::load_partition(store.as_ref(), name, *id).await?, + )); + } + Ok(Arc::new(Self { partitions: parts })) + } +} + +#[async_trait] +impl Index for FMIndexScalarIndex { + fn as_any(&self) -> &dyn std::any::Any { + self + } + fn as_index(self: Arc) -> Arc { + self + } + fn as_vector_index(self: Arc) -> Result> { + Err(Error::invalid_input_source( + "FMIndex is not a vector index".into(), + )) + } + async fn prewarm(&self) -> Result<()> { + Ok(()) + } + fn statistics(&self) -> Result { + Ok(serde_json::json!({ + "type": "FMIndex", + "num_partitions": self.partitions.len(), + "total_bwt_len": self.partitions.iter().map(|p| p.fm.wavelet.len).sum::(), + "total_docs": self.partitions.iter().map(|p| p.fm.row_ids.len()).sum::(), + })) + 
} + fn index_type(&self) -> IndexType { + IndexType::FMIndex + } + async fn calculate_included_frags(&self) -> Result { + let mut frags = RoaringBitmap::new(); + for p in &self.partitions { + for &rid in &p.fm.row_ids { + frags.insert((rid >> 32) as u32); + } + } + Ok(frags) + } +} + +#[async_trait] +impl ScalarIndex for FMIndexScalarIndex { + async fn search( + &self, + query: &dyn AnyQuery, + _metrics: &dyn MetricsCollector, + ) -> Result { + let tq = query + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::invalid_input("FMIndex only supports TextQuery"))?; + match tq { + TextQuery::StringContains(pattern) => { + let pb = pattern.as_bytes(); + use lance_select::RowAddrTreeMap; + let mut tree = RowAddrTreeMap::new(); + for p in &self.partitions { + p.fm.prewarm().await?; + for rid in p.fm.search(pb).iter() { + tree.insert(rid as u64); + } + } + Ok(SearchResult::Exact(lance_select::NullableRowAddrSet::new( + tree, + Default::default(), + ))) + } + } + } + fn can_remap(&self) -> bool { + false + } + async fn remap( + &self, + _: &HashMap>, + _: &dyn IndexStore, + ) -> Result { + Err(Error::not_supported("FMIndex does not support remap")) + } + async fn update( + &self, + new_data: SendableRecordBatchStream, + dest: &dyn IndexStore, + _: Option, + ) -> Result { + let texts = collect_texts(new_data).await?; + write_partitioned_fmindex(&texts, dest).await?; + Ok(CreatedIndex { + index_details: prost_types::Any::from_msg(&pb::FmIndexIndexDetails {}).unwrap(), + index_version: FMINDEX_INDEX_VERSION, + files: Some(dest.list_files_with_sizes().await?), + }) + } + fn update_criteria(&self) -> UpdateCriteria { + UpdateCriteria::only_new_data(TrainingCriteria::new(TrainingOrdering::None)) + } + fn derive_index_params(&self) -> Result { + Ok(ScalarIndexParams::for_builtin(BuiltinIndexType::FMIndex)) + } +} + +// ── Helpers ────────────────────────────────────────────────────────────────── + +async fn collect_texts(mut stream: SendableRecordBatchStream) -> Result)>> { + 
let mut texts = Vec::new(); + let mut next_id = 0u64; + while let Some(batch) = stream.next().await { + let batch = batch?; + let row_ids: Option<&arrow_array::UInt64Array> = batch + .column_by_name("_rowid") + .or_else(|| batch.column_by_name("_rowaddr")) + .and_then(|c| c.as_any().downcast_ref()); + let value_col = batch.column(0); + for i in 0..batch.num_rows() { + let rid = row_ids.map(|ids| ids.value(i)).unwrap_or_else(|| { + let id = next_id; + next_id += 1; + id + }); + if let Some(bytes) = extract_text_bytes(value_col.as_ref(), i)? { + let sanitized: Vec = bytes + .iter() + .map(|&b| { + if b == SENTINEL_BYTE || b == 0x00 { + b' ' + } else { + b + } + }) + .collect(); + texts.push((rid, sanitized)); + } + } + } + Ok(texts) +} + +fn extract_text_bytes(array: &dyn arrow_array::Array, index: usize) -> Result>> { + if array.is_null(index) { + return Ok(None); + } + match array.data_type() { + DataType::Utf8 => Ok(Some( + array + .as_any() + .downcast_ref::() + .unwrap() + .value(index) + .as_bytes() + .to_vec(), + )), + DataType::LargeUtf8 => Ok(Some( + array + .as_any() + .downcast_ref::() + .unwrap() + .value(index) + .as_bytes() + .to_vec(), + )), + DataType::Binary => Ok(Some( + array + .as_any() + .downcast_ref::() + .unwrap() + .value(index) + .to_vec(), + )), + DataType::LargeBinary => Ok(Some( + array + .as_any() + .downcast_ref::() + .unwrap() + .value(index) + .to_vec(), + )), + _ => Err(Error::invalid_input(format!( + "FMIndex does not support data type: {:?}", + array.data_type() + ))), + } +} + +fn hex_encode(data: &[u8]) -> String { + data.iter().map(|b| format!("{b:02x}")).collect() +} +fn hex_decode(s: &str) -> Result> { + if !s.len().is_multiple_of(2) { + return Err(Error::invalid_input("invalid hex length")); + } + (0..s.len()) + .step_by(2) + .map(|i| { + u8::from_str_radix(&s[i..i + 2], 16) + .map_err(|e| Error::invalid_input(format!("invalid hex: {e}"))) + }) + .collect() +} + +/// Write an FM-Index partition to storage. 
+/// +/// Layout: +/// - Wavelet block rows (BWT nodes) +/// - SA sample blocks (packed u64 in LargeBinary) +/// - Metadata: c_table, huffman_codes, tree_topology, row_ids, doc_start_positions +async fn write_fmindex(fm: &FMIndex, store: &dyn IndexStore, filename: &str) -> Result<()> { + let schema = Arc::new(FMIndex::block_schema()); + + let mut writer = store.new_index_file(filename, schema.clone()).await?; + + // 1. Wavelet blocks + let wb = fm.build_wavelet_batch()?; + let nw = wb.num_rows(); + writer.write_record_batch(wb).await?; + + // 2. SA samples packed as binary blocks + let u64s_per_block = BLOCK_WORDS; // 4096 u64s per block = 32KB + let mut sa_nid = Vec::new(); + let mut sa_bid = Vec::new(); + let mut sa_words: Vec> = Vec::new(); + let mut sa_pr = Vec::new(); + let mut sa_bl = Vec::new(); + for (bi, chunk) in fm.sa_samples.chunks(u64s_per_block).enumerate() { + sa_nid.push(u32::MAX); + sa_bid.push(bi as u32); + sa_words.push(FMIndex::u64_to_bytes(chunk)); + sa_pr.push(0u64); + sa_bl.push(fm.sa_samples.len() as u64); + } + let num_sa_blocks = sa_nid.len(); + if num_sa_blocks > 0 { + let refs: Vec<&[u8]> = sa_words.iter().map(|v| v.as_slice()).collect(); + writer + .write_record_batch(RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(arrow_array::UInt32Array::from(sa_nid)), + Arc::new(arrow_array::UInt32Array::from(sa_bid)), + Arc::new(arrow_array::LargeBinaryArray::from(refs)), + Arc::new(arrow_array::UInt64Array::from(sa_pr)), + Arc::new(arrow_array::UInt64Array::from(sa_bl)), + ], + )?) 
+ .await?; + } + + // Metadata + let mut metadata = HashMap::new(); + metadata.insert("num_bwt_nodes".into(), fm.wavelet.nodes.len().to_string()); + metadata.insert("bwt_len".into(), fm.wavelet.len.to_string()); + metadata.insert("num_sa_blocks".into(), num_sa_blocks.to_string()); + metadata.insert("sa_samples_len".into(), fm.sa_samples.len().to_string()); + metadata.insert("total_wavelet_rows".into(), nw.to_string()); + metadata.insert("sa_sample_rate".into(), SA_SAMPLE_RATE.to_string()); + metadata.insert("alphabet_size".into(), fm.alphabet_size.to_string()); + metadata.insert("c_table".into(), hex_encode(&fm.serialize_c_table())); + metadata.insert( + "huffman_codes".into(), + hex_encode(&fm.serialize_huffman_codes()), + ); + metadata.insert( + "tree_topology".into(), + hex_encode(&fm.serialize_tree_topology()), + ); + // row_ids in metadata (10K × 8 = 80KB per partition — small) + let row_ids_bytes: Vec = fm.row_ids.iter().flat_map(|&v| v.to_le_bytes()).collect(); + metadata.insert("row_ids".into(), hex_encode(&row_ids_bytes)); + // doc_start_positions in metadata (10K × 8 = 80KB per partition — small) + let doc_starts_bytes: Vec = fm + .doc_start_positions + .iter() + .flat_map(|&v| v.to_le_bytes()) + .collect(); + metadata.insert("doc_start_positions".into(), hex_encode(&doc_starts_bytes)); + + writer.finish_with_metadata(metadata).await?; + Ok(()) +} + +async fn write_partitioned_fmindex(texts: &[(u64, Vec)], store: &dyn IndexStore) -> Result<()> { + let refs: Vec<(u64, &[u8])> = texts.iter().map(|(id, t)| (*id, t.as_slice())).collect(); + if refs.is_empty() { + let fm = FMIndex::build(&[])?; + write_fmindex(&fm, store, &fmindex_partition_path(0)).await?; + return Ok(()); + } + for (pid, chunk) in refs.chunks(PARTITION_SIZE).enumerate() { + let fm = FMIndex::build(chunk)?; + write_fmindex(&fm, store, &fmindex_partition_path(pid as u64)).await?; + } + Ok(()) +} + +// ── Plugin ─────────────────────────────────────────────────────────────────── + 
+#[derive(Debug, Default)] +pub struct FMIndexPlugin; + +#[async_trait] +impl ScalarIndexPlugin for FMIndexPlugin { + fn name(&self) -> &str { + "FMIndex" + } + fn new_training_request( + &self, + _params: &str, + field: &Field, + ) -> Result> { + match field.data_type() { + DataType::Utf8 | DataType::LargeUtf8 | DataType::Binary | DataType::LargeBinary => {} + _ => { + return Err(Error::invalid_input(format!( + "FM-Index does not support {:?}", + field.data_type() + ))); + } + } + Ok(Box::new(DefaultTrainingRequest::new( + TrainingCriteria::new(TrainingOrdering::None), + ))) + } + async fn train_index( + &self, + data: SendableRecordBatchStream, + store: &dyn IndexStore, + _req: Box, + _fids: Option>, + _progress: Arc, + ) -> Result { + let texts = collect_texts(data).await?; + write_partitioned_fmindex(&texts, store).await?; + Ok(CreatedIndex { + index_details: prost_types::Any::from_msg(&pb::FmIndexIndexDetails {}).unwrap(), + index_version: FMINDEX_INDEX_VERSION, + files: Some(store.list_files_with_sizes().await?), + }) + } + fn provides_exact_answer(&self) -> bool { + true + } + fn version(&self) -> u32 { + FMINDEX_INDEX_VERSION + } + fn new_query_parser( + &self, + index_name: String, + _details: &prost_types::Any, + ) -> Option> { + Some(Box::new(TextQueryParser::new( + index_name, + self.name().to_string(), + false, + ))) + } + async fn load_index( + &self, + store: Arc, + details: &prost_types::Any, + fri: Option>, + cache: &LanceCache, + ) -> Result> { + let _ = details + .to_msg::() + .unwrap_or_default(); + Ok(FMIndexScalarIndex::load(store, fri, cache).await? 
as Arc) + } + async fn load_statistics( + &self, + _: Arc, + _: &prost_types::Any, + ) -> Result> { + Ok(None) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use lance_core::cache::LanceCache; + use lance_io::object_store::ObjectStore; + use object_store::path::Path; + use std::sync::Arc; + + use crate::scalar::lance_format::LanceIndexStore; + + #[test] + fn test_fmindex_build_and_search() { + let texts: Vec<(u64, &[u8])> = vec![ + (0, b"hello world"), + (1, b"hello rust"), + (2, b"goodbye world"), + ]; + let fm = FMIndex::build(&texts).unwrap(); + + let r = fm.search(b"hello"); + assert!(r.contains(0)); + assert!(r.contains(1)); + assert!(!r.contains(2)); + + let r = fm.search(b"world"); + assert!(r.contains(0)); + assert!(!r.contains(1)); + assert!(r.contains(2)); + + let r = fm.search(b"goodbye"); + assert!(!r.contains(0)); + assert!(!r.contains(1)); + assert!(r.contains(2)); + + assert!(fm.search(b"xyz").is_empty()); + } + + #[test] + fn test_fmindex_empty() { + let fm = FMIndex::build(&[]).unwrap(); + assert!(fm.search(b"anything").is_empty()); + } + + #[test] + fn test_fmindex_single_char_search() { + let texts: Vec<(u64, &[u8])> = vec![(0, b"abc"), (1, b"def")]; + let fm = FMIndex::build(&texts).unwrap(); + assert!(fm.search(b"a").contains(0)); + assert!(!fm.search(b"a").contains(1)); + assert!(!fm.search(b"d").contains(0)); + assert!(fm.search(b"d").contains(1)); + } + + #[test] + fn test_fmindex_repeated_pattern() { + let texts: Vec<(u64, &[u8])> = vec![(0, b"ababab"), (1, b"cdcd")]; + let fm = FMIndex::build(&texts).unwrap(); + assert!(fm.search(b"ab").contains(0)); + assert!(!fm.search(b"ab").contains(1)); + assert!(!fm.search(b"cd").contains(0)); + assert!(fm.search(b"cd").contains(1)); + } + + #[test] + fn test_early_exit_all_docs_match() { + let texts: Vec<(u64, &[u8])> = vec![(0, b"the cat"), (1, b"the dog"), (2, b"the bird")]; + let fm = FMIndex::build(&texts).unwrap(); + assert_eq!(fm.search(b"the").len(), 3); + } + + #[test] + fn 
test_locate_correctness() { + let texts: Vec<(u64, &[u8])> = vec![ + (0, b"the quick brown fox jumps over the lazy dog"), + (1, b"pack my box with five dozen liquor jugs"), + (2, b"how vexingly quick daft zebras jump"), + ]; + let fm = FMIndex::build(&texts).unwrap(); + + let r = fm.search(b"quick"); + assert!(r.contains(0)); + assert!(!r.contains(1)); + assert!(r.contains(2)); + + let r = fm.search(b"the"); + assert!(r.contains(0)); + assert!(!r.contains(1)); + assert!(!r.contains(2)); + + let r = fm.search(b"jump"); + assert!(r.contains(0)); + assert!(r.contains(2)); + } + + #[test] + fn test_many_documents() { + let docs: Vec> = (0..100) + .map(|i| format!("document number {} with hello world data xyz", i).into_bytes()) + .collect(); + let texts: Vec<(u64, &[u8])> = docs + .iter() + .enumerate() + .map(|(i, d)| (i as u64, d.as_slice())) + .collect(); + let fm = FMIndex::build(&texts).unwrap(); + + assert_eq!(fm.search(b"hello world").len(), 100); + assert_eq!(fm.search(b"document number 42").len(), 1); + assert_eq!(fm.search(b"nonexistent").len(), 0); + } + + #[test] + fn test_index_size_ratio() { + let docs: Vec> = (0..200) + .map(|i| { + format!( + "document {} with enough text to test size ratio properly end", + i + ) + .into_bytes() + }) + .collect(); + let texts: Vec<(u64, &[u8])> = docs + .iter() + .enumerate() + .map(|(i, d)| (i as u64, d.as_slice())) + .collect(); + let fm = FMIndex::build(&texts).unwrap(); + + let text_size: usize = docs.iter().map(|d| d.len()).sum(); + let wavelet_size = fm.wavelet.deep_size(); + let sa_size = fm.sa_samples.len() * 8; + let total = wavelet_size + sa_size; + + let ratio = total as f64 / text_size as f64; + assert!( + ratio < 1.5, + "index should be much smaller than text, got ratio={ratio:.2}" + ); + } + + #[test] + fn test_wavelet_access_consistency() { + let docs: Vec> = (0..50) + .map(|i| format!("document {i} hello world test").into_bytes()) + .collect(); + let texts: Vec<(u64, &[u8])> = docs + .iter() + 
.enumerate() + .map(|(i, d)| (i as u64, d.as_slice())) + .collect(); + + let mut concat = Vec::new(); + for (_, text) in &texts { + concat.extend_from_slice(text); + concat.push(SENTINEL_BYTE); + } + concat.push(0x00); + let sa = build_suffix_array(&concat); + let n = concat.len(); + let bwt: Vec = sa + .iter() + .map(|&pos| { + if pos == 0 { + concat[n - 1] + } else { + concat[pos - 1] + } + }) + .collect(); + let wavelet = HuffmanWaveletTree::build(&bwt); + + for (i, &expected) in bwt.iter().enumerate().take(n.min(500)) { + assert_eq!(wavelet.access(i), expected, "access mismatch at {i}"); + } + } + + #[test] + fn test_serialization_roundtrip() { + let texts: Vec<(u64, &[u8])> = vec![ + (10, b"alpha beta gamma"), + (20, b"beta gamma delta"), + (30, b"gamma delta epsilon"), + ]; + let fm = FMIndex::build(&texts).unwrap(); + + // Test huffman codes roundtrip + let hc_bytes = fm.serialize_huffman_codes(); + let hc = FMIndex::deserialize_huffman_codes(&hc_bytes); + for (i, (loaded, original)) in hc.iter().zip(fm.wavelet.codes.iter()).enumerate() { + assert_eq!(loaded.bits, original.bits, "bits mismatch at {i}"); + assert_eq!(loaded.length, original.length, "length mismatch at {i}"); + assert_eq!(loaded.node_path, original.node_path, "path mismatch at {i}"); + } + + // Test tree topology roundtrip + let topo_bytes = fm.serialize_tree_topology(); + let topo = FMIndex::deserialize_tree_topology(&topo_bytes); + assert_eq!(topo.len(), fm.wavelet.children.len()); + + // Test c_table roundtrip + let ct_bytes = fm.serialize_c_table(); + let ct = FMIndex::deserialize_c_table(&ct_bytes); + assert_eq!(ct, fm.c_table); + } + + #[test] + fn test_hex_roundtrip() { + let data = vec![0u8, 1, 127, 255, 42]; + let encoded = hex_encode(&data); + let decoded = hex_decode(&encoded).unwrap(); + assert_eq!(data, decoded); + } + + #[test] + fn test_sentinel_sanitization() { + // Text containing \xFF should be sanitized to space + let texts: Vec<(u64, &[u8])> = vec![(0, b"hello\xFFworld")]; 
+ let fm = FMIndex::build(&texts).unwrap(); + // The \xFF is replaced with space during collect_texts, but here we test build directly + // which doesn't sanitize. The search should still work. + let r = fm.search(b"hello"); + assert!(r.contains(0)); + } + + #[test] + fn test_wavelet_rank_pair_consistency() { + let docs: Vec> = (0..30) + .map(|i| format!("doc {i} with repeated words hello world test data").into_bytes()) + .collect(); + let texts: Vec<(u64, &[u8])> = docs + .iter() + .enumerate() + .map(|(i, d)| (i as u64, d.as_slice())) + .collect(); + let fm = FMIndex::build(&texts).unwrap(); + + let n = fm.wavelet.len; + for b in [b'a', b'e', b' ', SENTINEL_BYTE] { + for &(lo, hi) in &[(0usize, 1usize), (0, n), (n / 4, n / 2)] { + if lo >= n || hi > n || lo >= hi { + continue; + } + let (pl, ph) = fm.wavelet.rank_pair(b, lo, hi); + let rl = fm.wavelet.rank(b, lo); + let rh = fm.wavelet.rank(b, hi); + assert_eq!(pl, rl, "rank_pair lo mismatch for b={b} [{lo},{hi})"); + assert_eq!(ph, rh, "rank_pair hi mismatch for b={b} [{lo},{hi})"); + } + } + } + + #[test] + fn test_large_sa_sampling() { + // Test with enough documents to have multiple SA sample points + let docs: Vec> = (0..50) + .map(|i| { + format!( + "document number {} with lots of text to ensure we have enough bytes for multiple SA samples across the suffix array positions", + i + ) + .into_bytes() + }) + .collect(); + let texts: Vec<(u64, &[u8])> = docs + .iter() + .enumerate() + .map(|(i, d)| (i as u64, d.as_slice())) + .collect(); + let fm = FMIndex::build(&texts).unwrap(); + + assert!(fm.sa_samples.len() > 1, "should have multiple SA samples"); + assert_eq!(fm.search(b"document number 25").len(), 1); + assert_eq!(fm.search(b"document number").len(), 50); + assert_eq!(fm.search(b"nonexistent pattern").len(), 0); + } + + #[tokio::test(flavor = "multi_thread")] + async fn test_write_and_load_roundtrip() { + let texts: Vec<(u64, &[u8])> = vec![ + (0, b"hello world foo bar"), + (1, b"hello rust baz qux"), + 
(2, b"goodbye world quux"), + ]; + let fm = FMIndex::build(&texts).unwrap(); + + let tempdir = tempfile::tempdir().unwrap(); + let index_dir = Path::from_filesystem_path(tempdir.path()).unwrap(); + let store = Arc::new(LanceIndexStore::new( + Arc::new(ObjectStore::local()), + index_dir, + Arc::new(LanceCache::no_cache()), + )); + + // Write + write_fmindex(&fm, store.as_ref(), &fmindex_partition_path(0)) + .await + .unwrap(); + + // Load + let part = + FMIndexScalarIndex::load_partition(store.as_ref(), &fmindex_partition_path(0), 0) + .await + .unwrap(); + + // Verify search results match + let r = part.fm.search(b"hello"); + assert!(r.contains(0)); + assert!(r.contains(1)); + assert!(!r.contains(2)); + + let r = part.fm.search(b"world"); + assert!(r.contains(0)); + assert!(!r.contains(1)); + assert!(r.contains(2)); + + assert!(part.fm.search(b"xyz").is_empty()); + } + + #[tokio::test(flavor = "multi_thread")] + async fn test_partitioned_write_and_load() { + let docs: Vec> = (0..30) + .map(|i| format!("document {i} hello world test data").into_bytes()) + .collect(); + let texts: Vec<(u64, Vec)> = docs + .into_iter() + .enumerate() + .map(|(i, d)| (i as u64, d)) + .collect(); + + let tempdir = tempfile::tempdir().unwrap(); + let index_dir = Path::from_filesystem_path(tempdir.path()).unwrap(); + let store = Arc::new(LanceIndexStore::new( + Arc::new(ObjectStore::local()), + index_dir, + Arc::new(LanceCache::no_cache()), + )); + + write_partitioned_fmindex(&texts, store.as_ref()) + .await + .unwrap(); + + let index = FMIndexScalarIndex::load(store, None, &LanceCache::no_cache()) + .await + .unwrap(); + + // Search across partitions + let r = index + .search( + &TextQuery::StringContains("hello world".to_string()), + &crate::metrics::NoOpMetricsCollector, + ) + .await + .unwrap(); + match r { + SearchResult::Exact(set) => { + assert_eq!(set.len(), Some(30)); + } + _ => panic!("expected exact result"), + } + + let r = index + .search( + 
&TextQuery::StringContains("document 15".to_string()), + &crate::metrics::NoOpMetricsCollector, + ) + .await + .unwrap(); + match r { + SearchResult::Exact(set) => { + assert_eq!(set.len(), Some(1)); + } + _ => panic!("expected exact result"), + } + + let r = index + .search( + &TextQuery::StringContains("nonexistent".to_string()), + &crate::metrics::NoOpMetricsCollector, + ) + .await + .unwrap(); + match r { + SearchResult::Exact(set) => { + assert_eq!(set.len(), Some(0)); + } + _ => panic!("expected exact result"), + } + } + + #[tokio::test(flavor = "multi_thread")] + async fn test_plugin_train_and_load() { + use arrow_array::{StringArray, UInt64Array}; + use datafusion::physical_plan::stream::RecordBatchStreamAdapter; + use futures::stream; + use lance_core::ROW_ID; + + let docs = vec!["hello world", "hello rust", "goodbye world"]; + let row_ids: Vec = vec![0, 1, 2]; + let schema = Arc::new(arrow_schema::Schema::new(vec![ + arrow_schema::Field::new( + crate::scalar::registry::VALUE_COLUMN_NAME, + DataType::Utf8, + false, + ), + arrow_schema::Field::new(ROW_ID, DataType::UInt64, false), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(StringArray::from(docs)), + Arc::new(UInt64Array::from(row_ids)), + ], + ) + .unwrap(); + + let tempdir = tempfile::tempdir().unwrap(); + let index_dir = Path::from_filesystem_path(tempdir.path()).unwrap(); + let store = Arc::new(LanceIndexStore::new( + Arc::new(ObjectStore::local()), + index_dir, + Arc::new(LanceCache::no_cache()), + )); + + let stream = RecordBatchStreamAdapter::new(batch.schema(), stream::iter(vec![Ok(batch)])); + let req = FMIndexPlugin + .new_training_request("", &arrow_schema::Field::new("val", DataType::Utf8, false)) + .unwrap(); + let created = FMIndexPlugin + .train_index( + Box::pin(stream), + store.as_ref(), + req, + None, + Arc::new(crate::progress::NoopIndexBuildProgress), + ) + .await + .unwrap(); + + let index = FMIndexPlugin + .load_index(store, &created.index_details, 
None, &LanceCache::no_cache()) + .await + .unwrap(); + + let r = index + .search( + &TextQuery::StringContains("hello".to_string()), + &crate::metrics::NoOpMetricsCollector, + ) + .await + .unwrap(); + match r { + SearchResult::Exact(set) => { + assert_eq!(set.len(), Some(2)); + } + _ => panic!("expected exact result"), + } + } + + #[test] + fn test_build_wavelet_batch() { + let texts: Vec<(u64, &[u8])> = vec![(0, b"hello world"), (1, b"test data")]; + let fm = FMIndex::build(&texts).unwrap(); + let batch = fm.build_wavelet_batch().unwrap(); + assert!(batch.num_rows() > 0); + assert_eq!(batch.num_columns(), 5); + } + + #[test] + fn test_extract_text_bytes_types() { + use arrow_array::{BinaryArray, LargeBinaryArray, LargeStringArray, StringArray}; + + let utf8 = StringArray::from(vec!["hello"]); + assert_eq!( + extract_text_bytes(&utf8, 0).unwrap(), + Some(b"hello".to_vec()) + ); + + let large_utf8 = LargeStringArray::from(vec!["world"]); + assert_eq!( + extract_text_bytes(&large_utf8, 0).unwrap(), + Some(b"world".to_vec()) + ); + + let binary = BinaryArray::from(vec![b"bytes" as &[u8]]); + assert_eq!( + extract_text_bytes(&binary, 0).unwrap(), + Some(b"bytes".to_vec()) + ); + + let large_binary = LargeBinaryArray::from(vec![b"large" as &[u8]]); + assert_eq!( + extract_text_bytes(&large_binary, 0).unwrap(), + Some(b"large".to_vec()) + ); + + // Null handling + let nullable = StringArray::from(vec![None::<&str>]); + assert_eq!(extract_text_bytes(&nullable, 0).unwrap(), None); + } + + #[test] + fn test_fmindex_statistics() { + let rt = tokio::runtime::Builder::new_current_thread() + .build() + .unwrap(); + rt.block_on(async { + let docs: Vec> = (0..10).map(|i| format!("doc {i}").into_bytes()).collect(); + let texts: Vec<(u64, Vec)> = docs + .into_iter() + .enumerate() + .map(|(i, d)| (i as u64, d)) + .collect(); + + let tempdir = tempfile::tempdir().unwrap(); + let index_dir = Path::from_filesystem_path(tempdir.path()).unwrap(); + let store = 
Arc::new(LanceIndexStore::new( + Arc::new(ObjectStore::local()), + index_dir, + Arc::new(LanceCache::no_cache()), + )); + + write_partitioned_fmindex(&texts, store.as_ref()) + .await + .unwrap(); + let index = FMIndexScalarIndex::load(store, None, &LanceCache::no_cache()) + .await + .unwrap(); + + let stats = index.statistics().unwrap(); + assert_eq!(stats["type"], "FMIndex"); + assert_eq!(stats["total_docs"], 10); + assert!(stats["total_bwt_len"].as_u64().unwrap() > 0); + }); + } +} diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 20a63263222..07033a74616 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -423,6 +423,7 @@ fn legacy_type_name(index_uri: &str, index_type_hint: Option<&str>) -> String { "BloomFilter" => IndexType::BloomFilter.to_string(), "RTree" => IndexType::RTree.to_string(), "Inverted" => IndexType::Inverted.to_string(), + "FMIndex" => IndexType::FMIndex.to_string(), "Json" => IndexType::Scalar.to_string(), "Flat" | "Vector" => IndexType::Vector.to_string(), other if other.contains("Vector") => IndexType::Vector.to_string(), diff --git a/rust/lance/src/index/create.rs b/rust/lance/src/index/create.rs index a20efe63929..ff131c5ce23 100644 --- a/rust/lance/src/index/create.rs +++ b/rust/lance/src/index/create.rs @@ -222,6 +222,7 @@ impl<'a> CreateIndexBuilder<'a> { | IndexType::BTree | IndexType::Inverted | IndexType::NGram + | IndexType::FMIndex | IndexType::ZoneMap | IndexType::BloomFilter | IndexType::LabelList From 437849118f380d92c1ea849f99996e9072be58df Mon Sep 17 00:00:00 2001 From: Lance Release Bot Date: Thu, 4 Jun 2026 18:44:11 +0000 Subject: [PATCH 021/177] chore: release beta version 8.0.0-beta.3 --- .bumpversion.toml | 2 +- Cargo.lock | 58 +++++++++++++------------- Cargo.toml | 42 +++++++++---------- java/lance-jni/Cargo.lock | 87 ++++++++++++++++++++++++++++----------- java/lance-jni/Cargo.toml | 2 +- java/pom.xml | 2 +- python/Cargo.lock | 50 +++++++++++----------- python/Cargo.toml | 2 +- 8 
files changed, 143 insertions(+), 102 deletions(-) diff --git a/.bumpversion.toml b/.bumpversion.toml index 5e79b8cad2c..3c10a0473a8 100644 --- a/.bumpversion.toml +++ b/.bumpversion.toml @@ -1,5 +1,5 @@ [tool.bumpversion] -current_version = "8.0.0-beta.2" +current_version = "8.0.0-beta.3" parse = "(?P\\d+)\\.(?P\\d+)\\.(?P\\d+)(-(?P(beta|rc))\\.(?P\\d+))?" serialize = [ "{major}.{minor}.{patch}-{prerelease}.{prerelease_num}", diff --git a/Cargo.lock b/Cargo.lock index a03cd1f05bf..b78c27cb2bc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1365,9 +1365,9 @@ dependencies = [ [[package]] name = "chrono" -version = "0.4.44" +version = "0.4.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" +checksum = "1aa79e62e7697b8e29b513a68abacf485adcd1fe8284a4316c5ae868e6633327" dependencies = [ "iana-time-zone", "js-sys", @@ -3166,7 +3166,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4431,7 +4431,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "all_asserts", "approx", @@ -4534,7 +4534,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow-array", "arrow-buffer", @@ -4582,7 +4582,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrayref", "paste", @@ -4591,7 +4591,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow-array", "arrow-buffer", @@ -4628,7 +4628,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" 
dependencies = [ "arrow", "arrow-array", @@ -4661,7 +4661,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow", "arrow-array", @@ -4681,7 +4681,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow-arith", "arrow-array", @@ -4726,7 +4726,7 @@ dependencies = [ [[package]] name = "lance-examples" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "all_asserts", "arrow", @@ -4752,7 +4752,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow-arith", "arrow-array", @@ -4792,7 +4792,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "datafusion", "geo-traits", @@ -4806,7 +4806,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "approx", "arc-swap", @@ -4885,7 +4885,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow", "arrow-arith", @@ -4934,7 +4934,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "approx", "arrow-array", @@ -4954,7 +4954,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow", "async-trait", @@ -4966,7 +4966,7 @@ dependencies = [ [[package]] name = "lance-namespace-datafusion" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow-array", "arrow-schema", @@ -4982,7 +4982,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow", "arrow-ipc", @@ -5038,7 +5038,7 @@ dependencies = [ [[package]] name = "lance-select" -version = 
"8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow-array", "arrow-buffer", @@ -5057,7 +5057,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow", "arrow-array", @@ -5104,7 +5104,7 @@ dependencies = [ [[package]] name = "lance-test-macros" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "proc-macro2", "quote", @@ -5113,7 +5113,7 @@ dependencies = [ [[package]] name = "lance-testing" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow-array", "arrow-schema", @@ -5126,7 +5126,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "icu_segmenter", "jieba-rs", @@ -5138,7 +5138,7 @@ dependencies = [ [[package]] name = "lance-tools" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "clap", "lance-core", @@ -5340,9 +5340,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.31" +version = "0.4.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "113b30b4cd05f7c06868fdb2854f66a7b9fece9a48425351cd532e810d74024f" +checksum = "953f07c43838f8e6f9758cab68bf5bed85465e7587ebe0b823f1bcd81978ad3a" [[package]] name = "loom" @@ -7439,9 +7439,9 @@ dependencies = [ [[package]] name = "reqsign-volcengine-tos" -version = "3.0.0" +version = "3.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9d757602a7ef2b6025c0da77e6d2e23fbdef35930fa466b15ffbf0a3f13acf7" +checksum = "91d083a363b3577f519ce8425bb50f902622a28a83f7c4a26a5c990b66ec75b3" dependencies = [ "anyhow", "http 1.4.1", diff --git a/Cargo.toml b/Cargo.toml index f144c3f2d19..fb49e862f58 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,7 +31,7 @@ resolver = "3" [workspace.package] -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" @@ -56,26 +56,26 @@ rust-version = "1.91.0" 
[workspace.dependencies] arc-swap = "1.7" libc = "0.2.176" -lance = { version = "=8.0.0-beta.2", path = "./rust/lance", default-features = false } -lance-arrow = { version = "=8.0.0-beta.2", path = "./rust/lance-arrow" } -lance-core = { version = "=8.0.0-beta.2", path = "./rust/lance-core" } -lance-datafusion = { version = "=8.0.0-beta.2", path = "./rust/lance-datafusion" } -lance-datagen = { version = "=8.0.0-beta.2", path = "./rust/lance-datagen" } -lance-encoding = { version = "=8.0.0-beta.2", path = "./rust/lance-encoding" } -lance-file = { version = "=8.0.0-beta.2", path = "./rust/lance-file" } -lance-geo = { version = "=8.0.0-beta.2", path = "./rust/lance-geo" } -lance-index = { version = "=8.0.0-beta.2", path = "./rust/lance-index" } -lance-io = { version = "=8.0.0-beta.2", path = "./rust/lance-io", default-features = false } -lance-linalg = { version = "=8.0.0-beta.2", path = "./rust/lance-linalg" } -lance-namespace = { version = "=8.0.0-beta.2", path = "./rust/lance-namespace" } -lance-namespace-impls = { version = "=8.0.0-beta.2", path = "./rust/lance-namespace-impls" } +lance = { version = "=8.0.0-beta.3", path = "./rust/lance", default-features = false } +lance-arrow = { version = "=8.0.0-beta.3", path = "./rust/lance-arrow" } +lance-core = { version = "=8.0.0-beta.3", path = "./rust/lance-core" } +lance-datafusion = { version = "=8.0.0-beta.3", path = "./rust/lance-datafusion" } +lance-datagen = { version = "=8.0.0-beta.3", path = "./rust/lance-datagen" } +lance-encoding = { version = "=8.0.0-beta.3", path = "./rust/lance-encoding" } +lance-file = { version = "=8.0.0-beta.3", path = "./rust/lance-file" } +lance-geo = { version = "=8.0.0-beta.3", path = "./rust/lance-geo" } +lance-index = { version = "=8.0.0-beta.3", path = "./rust/lance-index" } +lance-io = { version = "=8.0.0-beta.3", path = "./rust/lance-io", default-features = false } +lance-linalg = { version = "=8.0.0-beta.3", path = "./rust/lance-linalg" } +lance-namespace = { version = 
"=8.0.0-beta.3", path = "./rust/lance-namespace" } +lance-namespace-impls = { version = "=8.0.0-beta.3", path = "./rust/lance-namespace-impls" } lance-namespace-datafusion = { version = "=7.0.0-beta.9", path = "./rust/lance-namespace-datafusion" } lance-namespace-reqwest-client = "0.8.0" -lance-select = { version = "=8.0.0-beta.2", path = "./rust/lance-select" } -lance-tokenizer = { version = "=8.0.0-beta.2", path = "./rust/lance-tokenizer" } -lance-table = { version = "=8.0.0-beta.2", path = "./rust/lance-table" } -lance-test-macros = { version = "=8.0.0-beta.2", path = "./rust/lance-test-macros" } -lance-testing = { version = "=8.0.0-beta.2", path = "./rust/lance-testing" } +lance-select = { version = "=8.0.0-beta.3", path = "./rust/lance-select" } +lance-tokenizer = { version = "=8.0.0-beta.3", path = "./rust/lance-tokenizer" } +lance-table = { version = "=8.0.0-beta.3", path = "./rust/lance-table" } +lance-test-macros = { version = "=8.0.0-beta.3", path = "./rust/lance-test-macros" } +lance-testing = { version = "=8.0.0-beta.3", path = "./rust/lance-testing" } approx = "0.5.1" # Note that this one does not include pyarrow arrow = { version = "58.0.0", optional = false, features = ["prettyprint"] } @@ -102,7 +102,7 @@ half = { "version" = "2.1", default-features = false, features = [ "num-traits", "std", ] } -lance-bitpacking = { version = "=8.0.0-beta.2", path = "./rust/compression/bitpacking" } +lance-bitpacking = { version = "=8.0.0-beta.3", path = "./rust/compression/bitpacking" } bitpacking = "0.9" bitvec = "1" bytes = "1.11.1" @@ -142,7 +142,7 @@ deepsize = "0.2.0" dirs = "6.0.0" either = "1.0" fst = { version = "0.4.7", features = ["levenshtein"] } -fsst = { version = "=8.0.0-beta.2", path = "./rust/compression/fsst" } +fsst = { version = "=8.0.0-beta.3", path = "./rust/compression/fsst" } futures = "0.3" geoarrow-array = "0.8" geoarrow-schema = "0.8" diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index 9ae7f003df8..bc93593ed24 100644 
--- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -1137,9 +1137,9 @@ dependencies = [ [[package]] name = "chrono" -version = "0.4.44" +version = "0.4.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" +checksum = "1aa79e62e7697b8e29b513a68abacf485adcd1fe8284a4316c5ae868e6633327" dependencies = [ "iana-time-zone", "js-sys", @@ -2569,7 +2569,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow-array", "rand 0.9.4", @@ -3721,7 +3721,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arc-swap", "arrow", @@ -3795,7 +3795,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow-array", "arrow-buffer", @@ -3837,7 +3837,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrayref", "paste", @@ -3846,7 +3846,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow-array", "arrow-buffer", @@ -3881,7 +3881,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow", "arrow-array", @@ -3913,7 +3913,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow", "arrow-array", @@ -3931,7 +3931,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow-arith", "arrow-array", @@ -3966,7 +3966,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.2" 
+version = "8.0.0-beta.3" dependencies = [ "arrow-arith", "arrow-array", @@ -3997,7 +3997,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "datafusion", "geo-traits", @@ -4011,7 +4011,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arc-swap", "arrow", @@ -4056,6 +4056,7 @@ dependencies = [ "lance-table", "lance-tokenizer", "libm", + "libsais-rs", "log", "ndarray", "num-traits", @@ -4080,7 +4081,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow", "arrow-arith", @@ -4122,7 +4123,7 @@ dependencies = [ [[package]] name = "lance-jni" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow", "arrow-array", @@ -4158,7 +4159,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow-array", "arrow-buffer", @@ -4174,7 +4175,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow", "async-trait", @@ -4186,7 +4187,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow", "arrow-ipc", @@ -4230,7 +4231,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow-array", "arrow-buffer", @@ -4246,7 +4247,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow", "arrow-array", @@ -4284,7 +4285,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "icu_segmenter", "rust-stemmers", @@ -4385,6 +4386,15 @@ dependencies = [ "libc", ] +[[package]] +name = "libsais-rs" +version = "0.2.0" +source 
= "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40fe164dbd47ea0c20e78a121c980ef673326905f1d4fba55e3645a20ef6717f" +dependencies = [ + "rayon", +] + [[package]] name = "link-section" version = "0.18.1" @@ -4420,9 +4430,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.31" +version = "0.4.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "113b30b4cd05f7c06868fdb2854f66a7b9fece9a48425351cd532e810d74024f" +checksum = "953f07c43838f8e6f9758cab68bf5bed85465e7587ebe0b823f1bcd81978ad3a" [[package]] name = "loom" @@ -4870,6 +4880,7 @@ dependencies = [ "opendal-service-hf", "opendal-service-oss", "opendal-service-s3", + "opendal-service-tos", ] [[package]] @@ -5087,6 +5098,23 @@ dependencies = [ "url", ] +[[package]] +name = "opendal-service-tos" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f2f7a4c32e5202eb4ac72e76c4b5e30c86ab60762811172f4111103b9d673a1" +dependencies = [ + "bytes", + "http 1.4.1", + "opendal-core", + "quick-xml 0.39.4", + "reqsign-core", + "reqsign-file-read-tokio", + "reqsign-volcengine-tos", + "serde", + "serde_json", +] + [[package]] name = "openssl-probe" version = "0.2.1" @@ -5994,6 +6022,19 @@ dependencies = [ "serde_json", ] +[[package]] +name = "reqsign-volcengine-tos" +version = "3.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91d083a363b3577f519ce8425bb50f902622a28a83f7c4a26a5c990b66ec75b3" +dependencies = [ + "anyhow", + "http 1.4.1", + "log", + "percent-encoding", + "reqsign-core", +] + [[package]] name = "reqwest" version = "0.12.28" diff --git a/java/lance-jni/Cargo.toml b/java/lance-jni/Cargo.toml index 8649360a870..090aedcae2f 100644 --- a/java/lance-jni/Cargo.toml +++ b/java/lance-jni/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lance-jni" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" edition = "2024" authors = ["Lance Devs "] rust-version = "1.91" diff --git a/java/pom.xml 
b/java/pom.xml index 579f8e52430..68390ec4128 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -7,7 +7,7 @@ org.lance lance-core Lance Core - 8.0.0-beta.2 + 8.0.0-beta.3 jar Lance Format Java API diff --git a/python/Cargo.lock b/python/Cargo.lock index de95b2730dc..47bd677dbfa 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -1302,9 +1302,9 @@ dependencies = [ [[package]] name = "chrono" -version = "0.4.44" +version = "0.4.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" +checksum = "1aa79e62e7697b8e29b513a68abacf485adcd1fe8284a4316c5ae868e6633327" dependencies = [ "iana-time-zone", "js-sys", @@ -2919,7 +2919,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4087,7 +4087,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arc-swap", "arrow", @@ -4162,7 +4162,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow-array", "arrow-buffer", @@ -4204,7 +4204,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrayref", "paste", @@ -4213,7 +4213,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow-array", "arrow-buffer", @@ -4248,7 +4248,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow", "arrow-array", @@ -4280,7 +4280,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow", "arrow-array", @@ 
-4298,7 +4298,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow-arith", "arrow-array", @@ -4333,7 +4333,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow-arith", "arrow-array", @@ -4364,7 +4364,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "datafusion", "geo-traits", @@ -4378,7 +4378,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arc-swap", "arrow", @@ -4449,7 +4449,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow", "arrow-arith", @@ -4491,7 +4491,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow-array", "arrow-buffer", @@ -4507,7 +4507,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow", "async-trait", @@ -4519,7 +4519,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow", "arrow-ipc", @@ -4563,7 +4563,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow-array", "arrow-buffer", @@ -4579,7 +4579,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow", "arrow-array", @@ -4619,7 +4619,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "icu_segmenter", "jieba-rs", @@ -4856,9 +4856,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.31" +version = "0.4.32" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "113b30b4cd05f7c06868fdb2854f66a7b9fece9a48425351cd532e810d74024f" +checksum = "953f07c43838f8e6f9758cab68bf5bed85465e7587ebe0b823f1bcd81978ad3a" [[package]] name = "loom" @@ -6092,7 +6092,7 @@ dependencies = [ [[package]] name = "pylance" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" dependencies = [ "arrow", "arrow-array", @@ -6722,9 +6722,9 @@ dependencies = [ [[package]] name = "reqsign-volcengine-tos" -version = "3.0.0" +version = "3.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9d757602a7ef2b6025c0da77e6d2e23fbdef35930fa466b15ffbf0a3f13acf7" +checksum = "91d083a363b3577f519ce8425bb50f902622a28a83f7c4a26a5c990b66ec75b3" dependencies = [ "anyhow", "http 1.4.1", diff --git a/python/Cargo.toml b/python/Cargo.toml index 68dcc5f65e9..dc78368145a 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pylance" -version = "8.0.0-beta.2" +version = "8.0.0-beta.3" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" From 2b4e4c8615e6bf73711961348a3b7eb0ef450890 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 4 Jun 2026 13:52:46 -0700 Subject: [PATCH 022/177] chore(deps): bump idna from 3.10 to 3.15 in /python (#6847) Bumps [idna](https://github.com/kjd/idna) from 3.10 to 3.15.

Changelog

Sourced from idna's changelog.

3.15 (2026-05-12)

  • Enforce DNS-length cap on individual labels early in check_label, short-circuiting contextual-rule processing for oversized input while staying compatible with UTS 46 usage.
  • Tidy core helpers: hoist bidi category sets to module-level frozensets (avoiding per-codepoint list construction), simplify length checks, and reuse the shared _unicode_dots_re from idna.core in the codec module.
  • Use raise ... from err for proper exception chaining and switch internal string formatting to f-strings.
  • Allow flit_core 4.x in the build backend.
  • Expand the ruff lint set (flake8-bugbear, flake8-simplify, pyupgrade, perflint) and apply the surfaced fixes; pin lint CI to Python 3.14.
  • Add Dependabot configuration for GitHub Actions.
  • Convert README and HISTORY from reStructuredText to Markdown.
  • Reference CVE-2026-45409 for the 3.14 advisory in place of the initial GHSA identifier.

Thanks to Felix Yan, Stan Ulbrych, and metsw24-max for contributions to this release.

3.14 (2026-05-10)

  • Removed opportunity to process long inputs into quadratic time by rejecting oversize inputs up-front. Closes a bypass of the CVE-2024-3651 mitigation. [CVE-2026-45409]

Thanks to Stan Ulbrych for reporting the issue.

3.13 (2026-04-22)

  • Correct classification error for codepoint U+A7F1

3.12 (2026-04-21)

  • Update to Unicode 17.0.0.
  • Issue a deprecation warning for the transitional argument.
  • Added lazy-loading to provide some performance improvements.
  • Removed vestiges of code related to Python 2 support, including segmentation of data structures specific to Jython.

Thanks to Rodrigo Nogueira for contributions to this release.

3.11 (2025-10-12)

  • Update to Unicode 16.0.0, including significant changes to UTS46 processing. As a result of Unicode ending support for it, transitional processing no longer has an effect and returns the same result.

... (truncated)

Commits
  • af30a09 Release 3.15
  • 30314d4 Pre-release 3.15rc0
  • 05d4b21 Merge pull request #237 from kjd/convert-docs-to-markdown
  • 2987fdb Convert README and HISTORY from reStructuredText to Markdown
  • 59fa800 Merge pull request #236 from kjd/dependabot/github_actions/actions-f3e34333ea
  • def6983 Merge branch 'master' into dependabot/github_actions/actions-f3e34333ea
  • bbd8004 Merge pull request #234 from StanFromIreland/patch-1
  • edd07c0 Bump github/codeql-action from 3.35.2 to 4.35.2 in the actions group
  • 5557db0 Merge branch 'master' into patch-1
  • f11746c Merge pull request #235 from StanFromIreland/patch-2
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=idna&package-manager=uv&previous-version=3.10&new-version=3.15)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself) You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/lance-format/lance/network/alerts).
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- python/uv.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/uv.lock b/python/uv.lock index 69e061b8075..fba7efc874f 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -981,11 +981,11 @@ wheels = [ [[package]] name = "idna" -version = "3.10" +version = "3.15" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490, upload-time = "2024-09-15T18:07:39.745Z" } +sdist = { url = "https://files.pythonhosted.org/packages/82/77/7b3966d0b9d1d31a36ddf1746926a11dface89a83409bf1483f0237aa758/idna-3.15.tar.gz", hash = "sha256:ca962446ea538f7092a95e057da437618e886f4d349216d2b1e294abfdb65fdc", size = 199245, upload-time = "2026-05-12T22:45:57.011Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" }, + { url = "https://files.pythonhosted.org/packages/d2/23/408243171aa9aaba178d3e2559159c24c1171a641aa83b67bdd3394ead8e/idna-3.15-py3-none-any.whl", hash = "sha256:048adeaf8c2d788c40fee287673ccaa74c24ffd8dcf09ffa555a2fbb59f10ac8", size = 72340, upload-time = "2026-05-12T22:45:55.733Z" }, ] [[package]] From f80b83a691390f2b9c06b693989391cef3063a74 Mon Sep 17 00:00:00 2001 From: Yu-Ju Huang <55553637+yuju-huang@users.noreply.github.com> Date: Thu, 4 Jun 2026 14:25:31 -0700 Subject: [PATCH 023/177] perf(pq): wrap l2_targets in Arc to eliminate per-partition clones (#7093) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
ProductQuantizer stores `l2_targets: Option<Vec<L2Prepared>>` — a pre-transposed SoA copy of the PQ codebook (~768 KB for a 768-dim, 48 sub-vector, 256-centroid index). Every call to `PQIndex::load()` (one per cached IVF partition) clones the entire ProductQuantizer, deep-copying this Vec. The data is logically identical across all partitions (derived from the same global codebook) but was physically duplicated N times. With 8000 partitions fully warmed, this wastes ~6 GB (768 KB × 8000). We measured this empirically: lance 4.0.1 used 2.9× more memory per cached partition than lance 0.39 (1894 KB vs 443 KB), with the excess tracing directly to l2_targets. Fix: change the field type to `Option<Arc<Vec<L2Prepared>>>`. Cloning a ProductQuantizer now bumps a reference count instead of copying megabytes; all partitions share one allocation. Changes (5 lines in one file): - Field type: Option<Vec<L2Prepared>> → Option<Arc<Vec<L2Prepared>>> - build_l2_targets return type updated to match - Construction: wrap collected Vec with Arc::new - DeepSizeOf: deref Arc before iterating (**v).iter() - build_l2_distance_table: targets.as_slice() to pass &[L2Prepared] --------- Co-authored-by: Yu-Ju Huang --- rust/lance-index/src/vector/pq.rs | 18 +++++++++--------- rust/lance-linalg/src/distance/l2.rs | 3 ++- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/rust/lance-index/src/vector/pq.rs b/rust/lance-index/src/vector/pq.rs index 3d11fc4a99e..bbb05238a30 100644 --- a/rust/lance-index/src/vector/pq.rs +++ b/rust/lance-index/src/vector/pq.rs @@ -49,7 +49,8 @@ pub struct ProductQuantizer { /// Pre-transposed L2 targets per sub-vector for fast f32 L2 batch computation. /// Only populated when codebook is f32 and distance_type is L2 /// (Cosine is converted to L2 before construction, so it benefits too). - l2_targets: Option>, + /// Wrapped in Arc so all clones (one per cached IVF partition) share one copy. 
+ l2_targets: Option>>, } impl DeepSizeOf for ProductQuantizer { @@ -59,10 +60,9 @@ impl DeepSizeOf for ProductQuantizer { + self.num_bits.deep_size_of_children(_context) + self.dimension.deep_size_of_children(_context) + self.distance_type.deep_size_of_children(_context) - + self - .l2_targets - .as_ref() - .map_or(0, |v| v.iter().map(|t| t.size_bytes()).sum()) + // deep_size_of_children on the Arc de-duplicates shared allocations + // via the context, so partitions sharing one l2_targets are counted once. + + self.l2_targets.deep_size_of_children(_context) } } @@ -74,7 +74,7 @@ impl ProductQuantizer { num_sub_vectors: usize, num_bits: u32, dimension: usize, - ) -> Option> { + ) -> Option>> { if codebook.value_type() != DataType::Float32 || distance_type != DistanceType::L2 { return None; } @@ -86,14 +86,14 @@ impl ProductQuantizer { let num_centroids = 2_usize.pow(num_bits); let block_size = sub_dim * num_centroids; - let targets = (0..num_sub_vectors) + let targets: Vec = (0..num_sub_vectors) .map(|sub_idx| { let block_start = sub_idx * block_size; let block = &values[block_start..block_start + block_size]; L2Prepared::new(block, sub_dim) }) .collect(); - Some(targets) + Some(Arc::new(targets)) } pub fn new( @@ -361,7 +361,7 @@ impl ProductQuantizer { DataType::Float32 => { if let Some(targets) = &self.l2_targets { let query = key.as_primitive::().values(); - Ok(build_distance_table_l2_prepared(targets, query)) + Ok(build_distance_table_l2_prepared(targets.as_slice(), query)) } else { Ok(self .build_l2_distance_table_impl::(key.as_primitive())) diff --git a/rust/lance-linalg/src/distance/l2.rs b/rust/lance-linalg/src/distance/l2.rs index 9aa5de6b9c5..036b54de8d1 100644 --- a/rust/lance-linalg/src/distance/l2.rs +++ b/rust/lance-linalg/src/distance/l2.rs @@ -15,6 +15,7 @@ use arrow_array::{ types::{Float16Type, Float32Type, Float64Type, Int8Type}, }; use arrow_schema::DataType; +use deepsize::DeepSizeOf; use half::{bf16, f16}; use lance_arrow::{ArrowFloatType, 
FixedSizeListArrayExt, FloatArray}; use lance_core::assume_eq; @@ -331,7 +332,7 @@ fn accumulate_l2_dimension(q: f32, row: &[f32], result: &mut [f32]) { /// sub-vector codebooks (e.g. 256 centroids × 16 dims = 16 KB). /// For large target sets the SoA layout causes L1 thrashing and /// [`l2_distance_batch`] with its AoS per-target locality is faster. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, DeepSizeOf)] pub struct L2Prepared { transposed: Vec, dimension: usize, From 507ecc29a6db7e31e8b9da0753bb7aa79d7785b0 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 4 Jun 2026 14:59:05 -0700 Subject: [PATCH 024/177] fix: cap exec-node parallelism to DataFusion target_partitions (#7087) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary `FilteredReadExec`, `LanceScanExec`, and `ANNIvfPartitionExec` each managed their own concurrency using process-wide CPU counts (`get_num_compute_intensive_cpus()`, `io_parallelism`), ignoring DataFusion's `target_partitions` session config. This makes it impossible to constrain query CPU usage in multi-tenant scenarios even when the caller sets `target_partitions` on the session. Changes: - **`FilteredReadExec`**: cap `OnePartitionMultipleThreads` num_threads with `target_partitions` in `obtain_stream()` - **`LanceScanExec`**: add `parallelism_cap: Option` to `LanceScanConfig`; set it from `target_partitions` in `execute()`; apply it to the CPU decode `try_buffered` (v2) and `batch_readahead` (v1) — IO-bound paths (`frag_parallelism`, `fragment_readahead`) are intentionally not capped - **`ANNIvfPartitionExec`**: cap delta index fan-out `.buffered()` with `target_partitions` - **`ANNIvfSubIndexExec`**: thread `target_partitions` through `initial_search`/`late_search` into `effective_query_parallelism()`, where it caps `get_num_compute_intensive_cpus()` before computing partition search parallelism ## Does this change default parallelism? No. 
`get_num_compute_intensive_cpus()` returns `num_cpus::get() - IO_CORE_RESERVATION` (default reservation = 2), where `num_cpus::get()` on Linux already reads cgroup CPU limits. DataFusion's default `target_partitions` is `available_parallelism()`, which is also cgroup-aware and returns the same logical CPU count. Since `cpus - 2 ≤ cpus`, `min(get_num_compute_intensive_cpus(), target_partitions)` equals `get_num_compute_intensive_cpus()` — the existing value — in all configurations. The cap only takes effect when a caller explicitly lowers `target_partitions` below the default, which is exactly the multi-tenant use case this change is intended to support. Closes #7082 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Sonnet 4.6 --- rust/lance/src/dataset/scanner.rs | 1 + rust/lance/src/io/exec/filtered_read.rs | 41 +++++++++++++- rust/lance/src/io/exec/knn.rs | 60 ++++++++++++++++++-- rust/lance/src/io/exec/scan.rs | 74 +++++++++++++++++++++++-- 4 files changed, 163 insertions(+), 13 deletions(-) diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index 3bdeee28f05..74f2619f7c2 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -4248,6 +4248,7 @@ impl Scanner { with_make_deletions_null, ordered_output: ordered, file_reader_options: self.resolved_file_reader_options(), + parallelism_cap: None, }; Arc::new(LanceScanExec::new( self.dataset.clone(), diff --git a/rust/lance/src/io/exec/filtered_read.rs b/rust/lance/src/io/exec/filtered_read.rs index d0f0b229cb2..b50cba1f9ce 100644 --- a/rust/lance/src/io/exec/filtered_read.rs +++ b/rust/lance/src/io/exec/filtered_read.rs @@ -1739,7 +1739,13 @@ impl FilteredReadExec { // Second, multiple partitions all share the same underlying task stream (see get_stream) let running_stream_lock = self.running_stream.clone(); let dataset = self.dataset.clone(); - let options = self.options.clone(); + let target_partitions = 
context.session_config().target_partitions(); + let mut options = self.options.clone(); + if let FilteredReadThreadingMode::OnePartitionMultipleThreads(n) = options.threading_mode { + options.threading_mode = FilteredReadThreadingMode::OnePartitionMultipleThreads( + n.min(target_partitions).max(1), + ); + } let batch_size_bytes = options .file_reader_options .as_ref() @@ -3854,4 +3860,37 @@ mod tests { assert_eq!(result1.column(i).as_ref(), result3.column(i).as_ref()); } } + + /// Verify that executing with target_partitions=1 produces the same results as the default + /// context and does not panic. This is a regression guard for the parallelism cap. + #[test_log::test(tokio::test)] + async fn test_target_partitions_cap_produces_correct_results() { + use datafusion::prelude::SessionConfig; + + let fixture = TestFixture::new().await; + + let options = FilteredReadOptions::basic_full_read(&fixture.dataset); + let plan = + FilteredReadExec::try_new(fixture.dataset.clone(), options.clone(), None).unwrap(); + + // Execute with default context (high thread count) + let default_ctx = Arc::new(TaskContext::default()); + let stream = plan.execute(0, default_ctx).unwrap(); + let schema = stream.schema(); + let batches = stream.try_collect::>().await.unwrap(); + let default_result = concat_batches(&schema, &batches).unwrap(); + + // Execute fresh plan with target_partitions=1 + let plan2 = FilteredReadExec::try_new(fixture.dataset.clone(), options, None).unwrap(); + let low_ctx = Arc::new( + TaskContext::default() + .with_session_config(SessionConfig::default().with_target_partitions(1)), + ); + let stream2 = plan2.execute(0, low_ctx).unwrap(); + let schema2 = stream2.schema(); + let batches2 = stream2.try_collect::>().await.unwrap(); + let capped_result = concat_batches(&schema2, &batches2).unwrap(); + + assert_eq!(default_result.num_rows(), capped_result.num_rows()); + } } diff --git a/rust/lance/src/io/exec/knn.rs b/rust/lance/src/io/exec/knn.rs index 
71239b4e34b..860e4322ebf 100644 --- a/rust/lance/src/io/exec/knn.rs +++ b/rust/lance/src/io/exec/knn.rs @@ -875,10 +875,11 @@ impl ExecutionPlan for ANNIvfPartitionExec { fn execute( &self, partition: usize, - _context: Arc, + context: Arc, ) -> DataFusionResult { let timer = Instant::now(); + let target_partitions = context.session_config().target_partitions(); let query = self.query.clone(); let ds = self.dataset.clone(); let metrics = Arc::new(AnnPartitionMetrics::new(&self.metrics, partition)); @@ -927,7 +928,7 @@ impl ExecutionPlan for ANNIvfPartitionExec { Ok::<_, DataFusionError>(batch) } }) - .buffered(self.index_uuids.len()) + .buffered(self.index_uuids.len().min(target_partitions).max(1)) .finally(move || { metrics_clone.baseline_metrics.done(); metrics_clone @@ -1148,8 +1149,14 @@ impl PartitionSearchControl for LatePartitionSearchControl { } } -fn effective_query_parallelism(query: &Query, index: &dyn VectorIndex) -> usize { - let cpu_pool_size = get_num_compute_intensive_cpus(); +fn effective_query_parallelism( + query: &Query, + index: &dyn VectorIndex, + target_partitions: usize, +) -> usize { + let cpu_pool_size = get_num_compute_intensive_cpus() + .min(target_partitions) + .max(1); effective_query_parallelism_for( query, cpu_pool_size, @@ -1211,6 +1218,7 @@ impl ANNIvfSubIndexExec { .boxed() } + #[allow(clippy::too_many_arguments)] fn late_search( index: Arc, query: Query, @@ -1219,6 +1227,7 @@ impl ANNIvfSubIndexExec { prefilter: Arc, metrics: Arc, state: Arc, + target_partitions: usize, ) -> impl Stream> { let stream = futures::stream::once(async move { let max_nprobes = query @@ -1288,7 +1297,8 @@ impl ANNIvfSubIndexExec { let state_clone = state.clone(); - let query_parallelism = effective_query_parallelism(&query, index.as_ref()); + let query_parallelism = + effective_query_parallelism(&query, index.as_ref(), target_partitions); if query_parallelism <= 1 { return stream::once(async move { let prefilter: Arc = prefilter; @@ -1359,6 +1369,7 @@ 
impl ANNIvfSubIndexExec { stream.flatten() } + #[allow(clippy::too_many_arguments)] fn initial_search( index: Arc, query: Query, @@ -1367,10 +1378,12 @@ impl ANNIvfSubIndexExec { prefilter: Arc, metrics: Arc, state: Arc, + target_partitions: usize, ) -> impl Stream> { let minimum_nprobes = query.minimum_nprobes.min(partitions.len()); - let query_parallelism = effective_query_parallelism(&query, index.as_ref()); + let query_parallelism = + effective_query_parallelism(&query, index.as_ref(), target_partitions); if query_parallelism <= 1 { metrics.partitions_searched.add(minimum_nprobes); return stream::once(async move { @@ -1502,6 +1515,7 @@ impl ExecutionPlan for ANNIvfSubIndexExec { ) -> DataFusionResult { let input_stream = self.input.execute(partition, context.clone())?; let schema = self.schema(); + let target_partitions = context.session_config().target_partitions(); let query = self.query.clone(); let ds = self.dataset.clone(); let column = self.query.column.clone(); @@ -1593,6 +1607,7 @@ impl ExecutionPlan for ANNIvfSubIndexExec { pre_filter.clone(), metrics.clone(), state.clone(), + target_partitions, ); let late_search = Self::late_search( raw_index.clone(), @@ -1602,6 +1617,7 @@ impl ExecutionPlan for ANNIvfSubIndexExec { pre_filter, metrics, state, + target_partitions, ); DataFusionResult::Ok(early_search.chain(late_search).boxed()) } @@ -1943,6 +1959,36 @@ mod tests { assert_eq!(effective_query_parallelism_for(&query, 16, 1), 16); } + #[test] + fn test_effective_query_parallelism_respects_target_partitions() { + // effective_query_parallelism caps cpu_pool_size at target_partitions before + // passing it to effective_query_parallelism_for, so the ceiling is + // min(cpu_pool_size, target_partitions). 
+ let mut query = base_query(); + let cpu_pool_size = 16; + + // use-all-cpus mode: capped at target_partitions + query.query_parallelism = -1; + assert_eq!( + effective_query_parallelism_for(&query, cpu_pool_size.min(4), 1), + 4 + ); + + // auto mode: auto_parallelism also clamped to the reduced cpu_pool_size + query.query_parallelism = 0; + assert_eq!( + effective_query_parallelism_for(&query, cpu_pool_size.min(4), 8), + 4 + ); + + // explicit parallelism > target_partitions: clamped down + query.query_parallelism = 16; + assert_eq!( + effective_query_parallelism_for(&query, cpu_pool_size.min(4), 1), + 4 + ); + } + #[derive(Debug, DeepSizeOf)] struct ThreadCapturingIndex { thread_name: Arc>>, @@ -2444,6 +2490,7 @@ mod tests { empty_prefilter().await, prepared_metrics(), state, + usize::MAX, ) .try_collect::>() .await @@ -2492,6 +2539,7 @@ mod tests { empty_prefilter().await, prepared_metrics(), state.clone(), + usize::MAX, ) .try_collect::>() .await diff --git a/rust/lance/src/io/exec/scan.rs b/rust/lance/src/io/exec/scan.rs index 15d8d181eab..3ec63ce04cc 100644 --- a/rust/lance/src/io/exec/scan.rs +++ b/rust/lance/src/io/exec/scan.rs @@ -342,7 +342,11 @@ impl LanceStream { // TODO: Ideally this will eventually get tied into datafusion as a # of partitions. This will let // us fully fuse decode into the first half of the plan. Currently there is likely to be a thread // transfer between the two steps. 
- .try_buffered(get_num_compute_intensive_cpus()) + .try_buffered( + get_num_compute_intensive_cpus() + .min(config.parallelism_cap.unwrap_or(usize::MAX)) + .max(1), + ) .stream_in_current_span() .boxed(); @@ -371,9 +375,13 @@ impl LanceStream { let fragment_readahead = config .fragment_readahead .unwrap_or(LEGACY_DEFAULT_FRAGMENT_READAHEAD); + let batch_readahead = config + .batch_readahead + .min(config.parallelism_cap.unwrap_or(usize::MAX)) + .max(1); debug!( "Scanning v1 dataset with frag_readahead={} and batch_readahead={}", - fragment_readahead, config.batch_readahead + fragment_readahead, batch_readahead ); let file_fragments = fragments @@ -410,7 +418,7 @@ impl LanceStream { // We must be waiting to finish a file before moving onto thenext. That's an issue. .try_flatten() // We buffer up to `batch_readahead` batches across all streams. - .try_buffered(config.batch_readahead) + .try_buffered(batch_readahead) .stream_in_current_span() .boxed() } else { @@ -443,7 +451,7 @@ impl LanceStream { tasks .try_flatten_unordered(config.fragment_readahead) // We buffer up to `batch_readahead` batches across all streams. - .try_buffer_unordered(config.batch_readahead) + .try_buffer_unordered(batch_readahead) .stream_in_current_span() .boxed() }; @@ -508,6 +516,9 @@ pub struct LanceScanConfig { pub with_make_deletions_null: bool, pub ordered_output: bool, pub file_reader_options: Option, + /// Upper bound on frag_parallelism and CPU decode concurrency. Set from + /// DataFusion's `target_partitions` session config in `LanceScanExec::execute`. 
+ pub parallelism_cap: Option, } // This is mostly for testing purposes, end users are unlikely to create this @@ -526,6 +537,7 @@ impl Default for LanceScanConfig { with_make_deletions_null: false, ordered_output: false, file_reader_options: None, + parallelism_cap: None, } } } @@ -690,13 +702,17 @@ impl ExecutionPlan for LanceScanExec { fn execute( &self, partition: usize, - _context: Arc, + context: Arc, ) -> Result { let dataset = self.dataset.clone(); let fragments = self.fragments.clone(); let range = self.range.clone(); let projection = self.projection.clone(); - let config = self.config.clone(); + let target_partitions = context.session_config().target_partitions(); + let config = LanceScanConfig { + parallelism_cap: Some(target_partitions), + ..self.config.clone() + }; let metrics = self.metrics.clone(); let lance_fut_stream = stream::once(async move { @@ -750,6 +766,9 @@ impl ExecutionPlan for LanceScanExec { #[cfg(test)] mod tests { use datafusion::execution::TaskContext; + use datafusion::prelude::SessionConfig; + use futures::TryStreamExt; + use lance_datagen::gen_batch; use crate::utils::test::NoContextTestFixture; @@ -772,4 +791,47 @@ mod tests { scan.execute(0, Arc::new(TaskContext::default())).unwrap(); } + + /// Verify that executing with target_partitions=1 produces the same row count as the + /// default context. Regression guard for the parallelism cap. 
+ #[tokio::test] + async fn test_target_partitions_cap_produces_correct_results() { + use lance_core::utils::tempfile::TempStrDir; + use lance_datagen::{Dimension, array}; + + use crate::utils::test::{DatagenExt, FragmentCount, FragmentRowCount}; + + let tmp = TempStrDir::default(); + let dataset = gen_batch() + .col("x", array::step::()) + .col( + "v", + array::rand_vec::(Dimension::from(4)), + ) + .into_dataset( + tmp.as_str(), + FragmentCount::from(4), + FragmentRowCount::from(100), + ) + .await + .unwrap(); + let dataset = Arc::new(dataset); + + let scan = LanceScanExec::new( + dataset.clone(), + dataset.fragments().clone(), + None, + Arc::new(dataset.schema().clone()), + LanceScanConfig::default(), + ); + + let low_ctx = Arc::new( + TaskContext::default() + .with_session_config(SessionConfig::default().with_target_partitions(1)), + ); + let stream = scan.execute(0, low_ctx).unwrap(); + let batches = stream.try_collect::>().await.unwrap(); + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 400); + } } From dd7201cdb2cf84af6f5fd2dd0f548c1c94962ec3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 4 Jun 2026 15:29:49 -0700 Subject: [PATCH 025/177] chore(deps): bump aiohttp from 3.13.4 to 3.14.0 in /python (#7090) [![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=aiohttp&package-manager=uv&previous-version=3.13.4&new-version=3.14.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself) You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/lance-format/lance/network/alerts).
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- python/uv.lock | 244 ++++++++++++++++++++++++------------------------- 1 file changed, 122 insertions(+), 122 deletions(-) diff --git a/python/uv.lock b/python/uv.lock index fba7efc874f..314417f5aa1 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -30,7 +30,7 @@ wheels = [ [[package]] name = "aiohttp" -version = "3.13.4" +version = "3.14.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohappyeyeballs", marker = "python_full_version >= '3.10'" }, @@ -40,129 +40,129 @@ dependencies = [ { name = "frozenlist", marker = "python_full_version >= '3.10'" }, { name = "multidict", marker = "python_full_version >= '3.10'" }, { name = "propcache", marker = "python_full_version >= '3.10'" }, + { name = "typing-extensions", marker = "python_full_version >= '3.10' and python_full_version < '3.13'" }, { name = "yarl", marker = "python_full_version >= '3.10'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/45/4a/064321452809dae953c1ed6e017504e72551a26b6f5708a5a80e4bf556ff/aiohttp-3.13.4.tar.gz", hash = "sha256:d97a6d09c66087890c2ab5d49069e1e570583f7ac0314ecf98294c1b6aaebd38", size = 7859748, upload-time = "2026-03-28T17:19:40.6Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2c/05/6817e0390eb47b0867cf8efdb535298191662192281bc3ca62a0cb7973eb/aiohttp-3.13.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6290fe12fe8cefa6ea3c1c5b969d32c010dfe191d4392ff9b599a3f473cbe722", size = 753094, upload-time = "2026-03-28T17:14:59.928Z" }, - { url = "https://files.pythonhosted.org/packages/b4/c1/e5b7f25f6dd1ab57da92aa9d226b2c8b56f223dd20475d3ddfddaba86ab8/aiohttp-3.13.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7520d92c0e8fbbe63f36f20a5762db349ff574ad38ad7bc7732558a650439845", size = 505213, upload-time = "2026-03-28T17:15:01.989Z" }, - { url = 
"https://files.pythonhosted.org/packages/b4/e5/8f42033c7ce98b54dfd3791f03e60231cfe4a2db4471b5fc188df2b8a6ad/aiohttp-3.13.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d2710ae1e1b81d0f187883b6e9d66cecf8794b50e91aa1e73fc78bfb5503b5d9", size = 498580, upload-time = "2026-03-28T17:15:03.879Z" }, - { url = "https://files.pythonhosted.org/packages/8c/a4/bbc989f5362066b81930da1a66084a859a971d03faab799dc59a3ce3a220/aiohttp-3.13.4-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:717d17347567ded1e273aa09918650dfd6fd06f461549204570c7973537d4123", size = 1692718, upload-time = "2026-03-28T17:15:05.541Z" }, - { url = "https://files.pythonhosted.org/packages/1c/72/3775116969931f151be116689d2ae6ddafff2ec2887d8f9b4e7043f32e74/aiohttp-3.13.4-cp310-cp310-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:383880f7b8de5ac208fa829c7038d08e66377283b2de9e791b71e06e803153c2", size = 1660714, upload-time = "2026-03-28T17:15:08.23Z" }, - { url = "https://files.pythonhosted.org/packages/a1/e8/d2f1a2da2743e32fe348ebf8a4c59caad14a92f5f18af616fd33381275e1/aiohttp-3.13.4-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:1867087e2c1963db1216aedf001efe3b129835ed2b05d97d058176a6d08b5726", size = 1744152, upload-time = "2026-03-28T17:15:10.828Z" }, - { url = "https://files.pythonhosted.org/packages/4c/a6/575886f417ac3c08e462f2ca237cc49f436bd992ca3f7ff95b7dd9c44205/aiohttp-3.13.4-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6234bf416a38d687c3ab7f79934d7fb2a42117a5b9813aca07de0a5398489023", size = 1836278, upload-time = "2026-03-28T17:15:12.537Z" }, - { url = "https://files.pythonhosted.org/packages/4a/4c/0051d4550fb9e8b5ca4e0fe1ccd58652340915180c5164999e6741bf2083/aiohttp-3.13.4-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:3cdd3393130bf6588962441ffd5bde1d3ea2d63a64afa7119b3f3ba349cebbe7", size = 1687953, upload-time = "2026-03-28T17:15:14.248Z" }, - { url = "https://files.pythonhosted.org/packages/c9/54/841e87b8c51c2adc01a3ceb9919dc45c7899fe4c21deb70aada734ea5a38/aiohttp-3.13.4-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0d0dbc6c76befa76865373d6aa303e480bb8c3486e7763530f7f6e527b471118", size = 1572484, upload-time = "2026-03-28T17:15:15.911Z" }, - { url = "https://files.pythonhosted.org/packages/da/f1/21cbf5f7fa1e267af6301f886cab9b314f085e4d0097668d189d165cd7da/aiohttp-3.13.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:10fb7b53262cf4144a083c9db0d2b4d22823d6708270a9970c4627b248c6064c", size = 1662851, upload-time = "2026-03-28T17:15:17.822Z" }, - { url = "https://files.pythonhosted.org/packages/40/15/bcad6b68d7bef27ae7443288215767263c7753ede164267cf6cf63c94a87/aiohttp-3.13.4-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:eb10ce8c03850e77f4d9518961c227be569e12f71525a7e90d17bca04299921d", size = 1671984, upload-time = "2026-03-28T17:15:19.561Z" }, - { url = "https://files.pythonhosted.org/packages/ff/fa/ab316931afc7a73c7f493bb1b30fbd61e28ec2d3ea50353336e76293e8ec/aiohttp-3.13.4-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:7c65738ac5ae32b8feef699a4ed0dc91a0c8618b347781b7461458bbcaaac7eb", size = 1713880, upload-time = "2026-03-28T17:15:21.589Z" }, - { url = "https://files.pythonhosted.org/packages/1c/45/314e8e64c7f328174964b6db511dd5e9e60c9121ab5457bc2c908b7d03a4/aiohttp-3.13.4-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:6b335919ffbaf98df8ff3c74f7a6decb8775882632952fd1810a017e38f15aee", size = 1560315, upload-time = "2026-03-28T17:15:23.66Z" }, - { url = "https://files.pythonhosted.org/packages/18/e7/93d5fa06fe00219a81466577dacae9e3732f3b4f767b12b2e2cc8c35c970/aiohttp-3.13.4-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:ec75fc18cb9f4aca51c2cbace20cf6716e36850f44189644d2d69a875d5e0532", size = 
1735115, upload-time = "2026-03-28T17:15:25.77Z" }, - { url = "https://files.pythonhosted.org/packages/19/9f/f64b95392ddd4e204fd9ab7cd33dd18d14ac9e4b86866f1f6a69b7cda83d/aiohttp-3.13.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:463fa18a95c5a635d2b8c09babe240f9d7dbf2a2010a6c0b35d8c4dff2a0e819", size = 1673916, upload-time = "2026-03-28T17:15:27.526Z" }, - { url = "https://files.pythonhosted.org/packages/52/c1/bb33be79fd285c69f32e5b074b299cae8847f748950149c3965c1b3b3adf/aiohttp-3.13.4-cp310-cp310-win32.whl", hash = "sha256:13168f5645d9045522c6cef818f54295376257ed8d02513a37c2ef3046fc7a97", size = 440277, upload-time = "2026-03-28T17:15:29.173Z" }, - { url = "https://files.pythonhosted.org/packages/23/f9/7cf1688da4dd0885f914ee40bc8e1dce776df98fe6518766de975a570538/aiohttp-3.13.4-cp310-cp310-win_amd64.whl", hash = "sha256:a7058af1f53209fdf07745579ced525d38d481650a989b7aa4a3b484b901cdab", size = 463015, upload-time = "2026-03-28T17:15:30.802Z" }, - { url = "https://files.pythonhosted.org/packages/d4/7e/cb94129302d78c46662b47f9897d642fd0b33bdfef4b73b20c6ced35aa4c/aiohttp-3.13.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:8ea0c64d1bcbf201b285c2246c51a0c035ba3bbd306640007bc5844a3b4658c1", size = 760027, upload-time = "2026-03-28T17:15:33.022Z" }, - { url = "https://files.pythonhosted.org/packages/5e/cd/2db3c9397c3bd24216b203dd739945b04f8b87bb036c640da7ddb63c75ef/aiohttp-3.13.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6f742e1fa45c0ed522b00ede565e18f97e4cf8d1883a712ac42d0339dfb0cce7", size = 508325, upload-time = "2026-03-28T17:15:34.714Z" }, - { url = "https://files.pythonhosted.org/packages/36/a3/d28b2722ec13107f2e37a86b8a169897308bab6a3b9e071ecead9d67bd9b/aiohttp-3.13.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6dcfb50ee25b3b7a1222a9123be1f9f89e56e67636b561441f0b304e25aaef8f", size = 502402, upload-time = "2026-03-28T17:15:36.409Z" }, - { url = 
"https://files.pythonhosted.org/packages/fa/d6/acd47b5f17c4430e555590990a4746efbcb2079909bb865516892bf85f37/aiohttp-3.13.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3262386c4ff370849863ea93b9ea60fd59c6cf56bf8f93beac625cf4d677c04d", size = 1771224, upload-time = "2026-03-28T17:15:38.223Z" }, - { url = "https://files.pythonhosted.org/packages/98/af/af6e20113ba6a48fd1cd9e5832c4851e7613ef50c7619acdaee6ec5f1aff/aiohttp-3.13.4-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:473bb5aa4218dd254e9ae4834f20e31f5a0083064ac0136a01a62ddbae2eaa42", size = 1731530, upload-time = "2026-03-28T17:15:39.988Z" }, - { url = "https://files.pythonhosted.org/packages/81/16/78a2f5d9c124ad05d5ce59a9af94214b6466c3491a25fb70760e98e9f762/aiohttp-3.13.4-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e56423766399b4c77b965f6aaab6c9546617b8994a956821cc507d00b91d978c", size = 1827925, upload-time = "2026-03-28T17:15:41.944Z" }, - { url = "https://files.pythonhosted.org/packages/2a/1f/79acf0974ced805e0e70027389fccbb7d728e6f30fcac725fb1071e63075/aiohttp-3.13.4-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8af249343fafd5ad90366a16d230fc265cf1149f26075dc9fe93cfd7c7173942", size = 1923579, upload-time = "2026-03-28T17:15:44.071Z" }, - { url = "https://files.pythonhosted.org/packages/af/53/29f9e2054ea6900413f3b4c3eb9d8331f60678ec855f13ba8714c47fd48d/aiohttp-3.13.4-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0bc0a5cf4f10ef5a2c94fdde488734b582a3a7a000b131263e27c9295bd682d9", size = 1767655, upload-time = "2026-03-28T17:15:45.911Z" }, - { url = "https://files.pythonhosted.org/packages/f3/57/462fe1d3da08109ba4aa8590e7aed57c059af2a7e80ec21f4bac5cfe1094/aiohttp-3.13.4-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = 
"sha256:5c7ff1028e3c9fc5123a865ce17df1cb6424d180c503b8517afbe89aa566e6be", size = 1630439, upload-time = "2026-03-28T17:15:48.11Z" }, - { url = "https://files.pythonhosted.org/packages/d7/4b/4813344aacdb8127263e3eec343d24e973421143826364fa9fc847f6283f/aiohttp-3.13.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ba5cf98b5dcb9bddd857da6713a503fa6d341043258ca823f0f5ab7ab4a94ee8", size = 1745557, upload-time = "2026-03-28T17:15:50.13Z" }, - { url = "https://files.pythonhosted.org/packages/d4/01/1ef1adae1454341ec50a789f03cfafe4c4ac9c003f6a64515ecd32fe4210/aiohttp-3.13.4-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:d85965d3ba21ee4999e83e992fecb86c4614d6920e40705501c0a1f80a583c12", size = 1741796, upload-time = "2026-03-28T17:15:52.351Z" }, - { url = "https://files.pythonhosted.org/packages/22/04/8cdd99af988d2aa6922714d957d21383c559835cbd43fbf5a47ddf2e0f05/aiohttp-3.13.4-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:49f0b18a9b05d79f6f37ddd567695943fcefb834ef480f17a4211987302b2dc7", size = 1805312, upload-time = "2026-03-28T17:15:54.407Z" }, - { url = "https://files.pythonhosted.org/packages/fb/7f/b48d5577338d4b25bbdbae35c75dbfd0493cb8886dc586fbfb2e90862239/aiohttp-3.13.4-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:7f78cb080c86fbf765920e5f1ef35af3f24ec4314d6675d0a21eaf41f6f2679c", size = 1621751, upload-time = "2026-03-28T17:15:56.564Z" }, - { url = "https://files.pythonhosted.org/packages/bc/89/4eecad8c1858e6d0893c05929e22343e0ebe3aec29a8a399c65c3cc38311/aiohttp-3.13.4-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:67a3ec705534a614b68bbf1c70efa777a21c3da3895d1c44510a41f5a7ae0453", size = 1826073, upload-time = "2026-03-28T17:15:58.489Z" }, - { url = "https://files.pythonhosted.org/packages/f5/5c/9dc8293ed31b46c39c9c513ac7ca152b3c3d38e0ea111a530ad12001b827/aiohttp-3.13.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d6630ec917e85c5356b2295744c8a97d40f007f96a1c76bf1928dc2e27465393", size = 1760083, upload-time = 
"2026-03-28T17:16:00.677Z" }, - { url = "https://files.pythonhosted.org/packages/1e/19/8bbf6a4994205d96831f97b7d21a0feed120136e6267b5b22d229c6dc4dc/aiohttp-3.13.4-cp311-cp311-win32.whl", hash = "sha256:54049021bc626f53a5394c29e8c444f726ee5a14b6e89e0ad118315b1f90f5e3", size = 439690, upload-time = "2026-03-28T17:16:02.902Z" }, - { url = "https://files.pythonhosted.org/packages/0c/f5/ac409ecd1007528d15c3e8c3a57d34f334c70d76cfb7128a28cffdebd4c1/aiohttp-3.13.4-cp311-cp311-win_amd64.whl", hash = "sha256:c033f2bc964156030772d31cbf7e5defea181238ce1f87b9455b786de7d30145", size = 463824, upload-time = "2026-03-28T17:16:05.058Z" }, - { url = "https://files.pythonhosted.org/packages/1e/bd/ede278648914cabbabfdf95e436679b5d4156e417896a9b9f4587169e376/aiohttp-3.13.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ee62d4471ce86b108b19c3364db4b91180d13fe3510144872d6bad5401957360", size = 752158, upload-time = "2026-03-28T17:16:06.901Z" }, - { url = "https://files.pythonhosted.org/packages/90/de/581c053253c07b480b03785196ca5335e3c606a37dc73e95f6527f1591fe/aiohttp-3.13.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c0fd8f41b54b58636402eb493afd512c23580456f022c1ba2db0f810c959ed0d", size = 501037, upload-time = "2026-03-28T17:16:08.82Z" }, - { url = "https://files.pythonhosted.org/packages/fa/f9/a5ede193c08f13cc42c0a5b50d1e246ecee9115e4cf6e900d8dbd8fd6acb/aiohttp-3.13.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4baa48ce49efd82d6b1a0be12d6a36b35e5594d1dd42f8bfba96ea9f8678b88c", size = 501556, upload-time = "2026-03-28T17:16:10.63Z" }, - { url = "https://files.pythonhosted.org/packages/d6/10/88ff67cd48a6ec36335b63a640abe86135791544863e0cfe1f065d6cef7a/aiohttp-3.13.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d738ebab9f71ee652d9dbd0211057690022201b11197f9a7324fd4dba128aa97", size = 1757314, upload-time = "2026-03-28T17:16:12.498Z" }, - { url = 
"https://files.pythonhosted.org/packages/8b/15/fdb90a5cf5a1f52845c276e76298c75fbbcc0ac2b4a86551906d54529965/aiohttp-3.13.4-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:0ce692c3468fa831af7dceed52edf51ac348cebfc8d3feb935927b63bd3e8576", size = 1731819, upload-time = "2026-03-28T17:16:14.558Z" }, - { url = "https://files.pythonhosted.org/packages/ec/df/28146785a007f7820416be05d4f28cc207493efd1e8c6c1068e9bdc29198/aiohttp-3.13.4-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8e08abcfe752a454d2cb89ff0c08f2d1ecd057ae3e8cc6d84638de853530ebab", size = 1793279, upload-time = "2026-03-28T17:16:16.594Z" }, - { url = "https://files.pythonhosted.org/packages/10/47/689c743abf62ea7a77774d5722f220e2c912a77d65d368b884d9779ef41b/aiohttp-3.13.4-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5977f701b3fff36367a11087f30ea73c212e686d41cd363c50c022d48b011d8d", size = 1891082, upload-time = "2026-03-28T17:16:18.71Z" }, - { url = "https://files.pythonhosted.org/packages/b0/b6/f7f4f318c7e58c23b761c9b13b9a3c9b394e0f9d5d76fbc6622fa98509f6/aiohttp-3.13.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:54203e10405c06f8b6020bd1e076ae0fe6c194adcee12a5a78af3ffa3c57025e", size = 1773938, upload-time = "2026-03-28T17:16:21.125Z" }, - { url = "https://files.pythonhosted.org/packages/aa/06/f207cb3121852c989586a6fc16ff854c4fcc8651b86c5d3bd1fc83057650/aiohttp-3.13.4-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:358a6af0145bc4dda037f13167bef3cce54b132087acc4c295c739d05d16b1c3", size = 1579548, upload-time = "2026-03-28T17:16:23.588Z" }, - { url = "https://files.pythonhosted.org/packages/6c/58/e1289661a32161e24c1fe479711d783067210d266842523752869cc1d9c2/aiohttp-3.13.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:898ea1850656d7d61832ef06aa9846ab3ddb1621b74f46de78fbc5e1a586ba83", size = 1714669, upload-time = "2026-03-28T17:16:25.713Z" }, - { url = "https://files.pythonhosted.org/packages/96/0a/3e86d039438a74a86e6a948a9119b22540bae037d6ba317a042ae3c22711/aiohttp-3.13.4-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:7bc30cceb710cf6a44e9617e43eebb6e3e43ad855a34da7b4b6a73537d8a6763", size = 1754175, upload-time = "2026-03-28T17:16:28.18Z" }, - { url = "https://files.pythonhosted.org/packages/f4/30/e717fc5df83133ba467a560b6d8ef20197037b4bb5d7075b90037de1018e/aiohttp-3.13.4-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:4a31c0c587a8a038f19a4c7e60654a6c899c9de9174593a13e7cc6e15ff271f9", size = 1762049, upload-time = "2026-03-28T17:16:30.941Z" }, - { url = "https://files.pythonhosted.org/packages/e4/28/8f7a2d4492e336e40005151bdd94baf344880a4707573378579f833a64c1/aiohttp-3.13.4-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:2062f675f3fe6e06d6113eb74a157fb9df58953ffed0cdb4182554b116545758", size = 1570861, upload-time = "2026-03-28T17:16:32.953Z" }, - { url = "https://files.pythonhosted.org/packages/78/45/12e1a3d0645968b1c38de4b23fdf270b8637735ea057d4f84482ff918ad9/aiohttp-3.13.4-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:3d1ba8afb847ff80626d5e408c1fdc99f942acc877d0702fe137015903a220a9", size = 1790003, upload-time = "2026-03-28T17:16:35.468Z" }, - { url = "https://files.pythonhosted.org/packages/eb/0f/60374e18d590de16dcb39d6ff62f39c096c1b958e6f37727b5870026ea30/aiohttp-3.13.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b08149419994cdd4d5eecf7fd4bc5986b5a9380285bcd01ab4c0d6bfca47b79d", size = 1737289, upload-time = "2026-03-28T17:16:38.187Z" }, - { url = "https://files.pythonhosted.org/packages/02/bf/535e58d886cfbc40a8b0013c974afad24ef7632d645bca0b678b70033a60/aiohttp-3.13.4-cp312-cp312-win32.whl", hash = "sha256:fc432f6a2c4f720180959bc19aa37259651c1a4ed8af8afc84dd41c60f15f791", size = 434185, upload-time = "2026-03-28T17:16:40.735Z" 
}, - { url = "https://files.pythonhosted.org/packages/1e/1a/d92e3325134ebfff6f4069f270d3aac770d63320bd1fcd0eca023e74d9a8/aiohttp-3.13.4-cp312-cp312-win_amd64.whl", hash = "sha256:6148c9ae97a3e8bff9a1fc9c757fa164116f86c100468339730e717590a3fb77", size = 461285, upload-time = "2026-03-28T17:16:42.713Z" }, - { url = "https://files.pythonhosted.org/packages/e3/ac/892f4162df9b115b4758d615f32ec63d00f3084c705ff5526630887b9b42/aiohttp-3.13.4-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:63dd5e5b1e43b8fb1e91b79b7ceba1feba588b317d1edff385084fcc7a0a4538", size = 745744, upload-time = "2026-03-28T17:16:44.67Z" }, - { url = "https://files.pythonhosted.org/packages/97/a9/c5b87e4443a2f0ea88cb3000c93a8fdad1ee63bffc9ded8d8c8e0d66efc6/aiohttp-3.13.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:746ac3cc00b5baea424dacddea3ec2c2702f9590de27d837aa67004db1eebc6e", size = 498178, upload-time = "2026-03-28T17:16:46.766Z" }, - { url = "https://files.pythonhosted.org/packages/94/42/07e1b543a61250783650df13da8ddcdc0d0a5538b2bd15cef6e042aefc61/aiohttp-3.13.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bda8f16ea99d6a6705e5946732e48487a448be874e54a4f73d514660ff7c05d3", size = 498331, upload-time = "2026-03-28T17:16:48.9Z" }, - { url = "https://files.pythonhosted.org/packages/20/d6/492f46bf0328534124772d0cf58570acae5b286ea25006900650f69dae0e/aiohttp-3.13.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4b061e7b5f840391e3f64d0ddf672973e45c4cfff7a0feea425ea24e51530fc2", size = 1744414, upload-time = "2026-03-28T17:16:50.968Z" }, - { url = "https://files.pythonhosted.org/packages/e2/4d/e02627b2683f68051246215d2d62b2d2f249ff7a285e7a858dc47d6b6a14/aiohttp-3.13.4-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:b252e8d5cd66184b570d0d010de742736e8a4fab22c58299772b0c5a466d4b21", size = 1719226, upload-time = "2026-03-28T17:16:53.173Z" }, - { url = 
"https://files.pythonhosted.org/packages/7b/6c/5d0a3394dd2b9f9aeba6e1b6065d0439e4b75d41f1fb09a3ec010b43552b/aiohttp-3.13.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:20af8aad61d1803ff11152a26146d8d81c266aa8c5aa9b4504432abb965c36a0", size = 1782110, upload-time = "2026-03-28T17:16:55.362Z" }, - { url = "https://files.pythonhosted.org/packages/0d/2d/c20791e3437700a7441a7edfb59731150322424f5aadf635602d1d326101/aiohttp-3.13.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:13a5cc924b59859ad2adb1478e31f410a7ed46e92a2a619d6d1dd1a63c1a855e", size = 1884809, upload-time = "2026-03-28T17:16:57.734Z" }, - { url = "https://files.pythonhosted.org/packages/c8/94/d99dbfbd1924a87ef643833932eb2a3d9e5eee87656efea7d78058539eff/aiohttp-3.13.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:534913dfb0a644d537aebb4123e7d466d94e3be5549205e6a31f72368980a81a", size = 1764938, upload-time = "2026-03-28T17:17:00.221Z" }, - { url = "https://files.pythonhosted.org/packages/49/61/3ce326a1538781deb89f6cf5e094e2029cd308ed1e21b2ba2278b08426f6/aiohttp-3.13.4-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:320e40192a2dcc1cf4b5576936e9652981ab596bf81eb309535db7e2f5b5672f", size = 1570697, upload-time = "2026-03-28T17:17:02.985Z" }, - { url = "https://files.pythonhosted.org/packages/b6/77/4ab5a546857bb3028fbaf34d6eea180267bdab022ee8b1168b1fcde4bfdd/aiohttp-3.13.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9e587fcfce2bcf06526a43cb705bdee21ac089096f2e271d75de9c339db3100c", size = 1702258, upload-time = "2026-03-28T17:17:05.28Z" }, - { url = "https://files.pythonhosted.org/packages/79/63/d8f29021e39bc5af8e5d5e9da1b07976fb9846487a784e11e4f4eeda4666/aiohttp-3.13.4-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:9eb9c2eea7278206b5c6c1441fdd9dc420c278ead3f3b2cc87f9b693698cc500", size = 1740287, upload-time 
= "2026-03-28T17:17:07.712Z" }, - { url = "https://files.pythonhosted.org/packages/55/3a/cbc6b3b124859a11bc8055d3682c26999b393531ef926754a3445b99dfef/aiohttp-3.13.4-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:29be00c51972b04bf9d5c8f2d7f7314f48f96070ca40a873a53056e652e805f7", size = 1753011, upload-time = "2026-03-28T17:17:10.053Z" }, - { url = "https://files.pythonhosted.org/packages/e0/30/836278675205d58c1368b21520eab9572457cf19afd23759216c04483048/aiohttp-3.13.4-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:90c06228a6c3a7c9f776fe4fc0b7ff647fffd3bed93779a6913c804ae00c1073", size = 1566359, upload-time = "2026-03-28T17:17:12.433Z" }, - { url = "https://files.pythonhosted.org/packages/50/b4/8032cc9b82d17e4277704ba30509eaccb39329dc18d6a35f05e424439e32/aiohttp-3.13.4-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:a533ec132f05fd9a1d959e7f34184cd7d5e8511584848dab85faefbaac573069", size = 1785537, upload-time = "2026-03-28T17:17:14.721Z" }, - { url = "https://files.pythonhosted.org/packages/17/7d/5873e98230bde59f493bf1f7c3e327486a4b5653fa401144704df5d00211/aiohttp-3.13.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:1c946f10f413836f82ea4cfb90200d2a59578c549f00857e03111cf45ad01ca5", size = 1740752, upload-time = "2026-03-28T17:17:17.387Z" }, - { url = "https://files.pythonhosted.org/packages/7b/f2/13e46e0df051494d7d3c68b7f72d071f48c384c12716fc294f75d5b1a064/aiohttp-3.13.4-cp313-cp313-win32.whl", hash = "sha256:48708e2706106da6967eff5908c78ca3943f005ed6bcb75da2a7e4da94ef8c70", size = 433187, upload-time = "2026-03-28T17:17:19.523Z" }, - { url = "https://files.pythonhosted.org/packages/ea/c0/649856ee655a843c8f8664592cfccb73ac80ede6a8c8db33a25d810c12db/aiohttp-3.13.4-cp313-cp313-win_amd64.whl", hash = "sha256:74a2eb058da44fa3a877a49e2095b591d4913308bb424c418b77beb160c55ce3", size = 459778, upload-time = "2026-03-28T17:17:21.964Z" }, - { url = 
"https://files.pythonhosted.org/packages/6d/29/6657cc37ae04cacc2dbf53fb730a06b6091cc4cbe745028e047c53e6d840/aiohttp-3.13.4-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:e0a2c961fc92abeff61d6444f2ce6ad35bb982db9fc8ff8a47455beacf454a57", size = 749363, upload-time = "2026-03-28T17:17:24.044Z" }, - { url = "https://files.pythonhosted.org/packages/90/7f/30ccdf67ca3d24b610067dc63d64dcb91e5d88e27667811640644aa4a85d/aiohttp-3.13.4-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:153274535985a0ff2bff1fb6c104ed547cec898a09213d21b0f791a44b14d933", size = 499317, upload-time = "2026-03-28T17:17:26.199Z" }, - { url = "https://files.pythonhosted.org/packages/93/13/e372dd4e68ad04ee25dafb050c7f98b0d91ea643f7352757e87231102555/aiohttp-3.13.4-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:351f3171e2458da3d731ce83f9e6b9619e325c45cbd534c7759750cabf453ad7", size = 500477, upload-time = "2026-03-28T17:17:28.279Z" }, - { url = "https://files.pythonhosted.org/packages/e5/fe/ee6298e8e586096fb6f5eddd31393d8544f33ae0792c71ecbb4c2bef98ac/aiohttp-3.13.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f989ac8bc5595ff761a5ccd32bdb0768a117f36dd1504b1c2c074ed5d3f4df9c", size = 1737227, upload-time = "2026-03-28T17:17:30.587Z" }, - { url = "https://files.pythonhosted.org/packages/b0/b9/a7a0463a09e1a3fe35100f74324f23644bfc3383ac5fd5effe0722a5f0b7/aiohttp-3.13.4-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:d36fc1709110ec1e87a229b201dd3ddc32aa01e98e7868083a794609b081c349", size = 1694036, upload-time = "2026-03-28T17:17:33.29Z" }, - { url = "https://files.pythonhosted.org/packages/57/7c/8972ae3fb7be00a91aee6b644b2a6a909aedb2c425269a3bfd90115e6f8f/aiohttp-3.13.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:42adaeea83cbdf069ab94f5103ce0787c21fb1a0153270da76b59d5578302329", size = 1786814, upload-time = 
"2026-03-28T17:17:36.035Z" }, - { url = "https://files.pythonhosted.org/packages/93/01/c81e97e85c774decbaf0d577de7d848934e8166a3a14ad9f8aa5be329d28/aiohttp-3.13.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:92deb95469928cc41fd4b42a95d8012fa6df93f6b1c0a83af0ffbc4a5e218cde", size = 1866676, upload-time = "2026-03-28T17:17:38.441Z" }, - { url = "https://files.pythonhosted.org/packages/5a/5f/5b46fe8694a639ddea2cd035bf5729e4677ea882cb251396637e2ef1590d/aiohttp-3.13.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0c0c7c07c4257ef3a1df355f840bc62d133bcdef5c1c5ba75add3c08553e2eed", size = 1740842, upload-time = "2026-03-28T17:17:40.783Z" }, - { url = "https://files.pythonhosted.org/packages/20/a2/0d4b03d011cca6b6b0acba8433193c1e484efa8d705ea58295590fe24203/aiohttp-3.13.4-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f062c45de8a1098cb137a1898819796a2491aec4e637a06b03f149315dff4d8f", size = 1566508, upload-time = "2026-03-28T17:17:43.235Z" }, - { url = "https://files.pythonhosted.org/packages/98/17/e689fd500da52488ec5f889effd6404dece6a59de301e380f3c64f167beb/aiohttp-3.13.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:76093107c531517001114f0ebdb4f46858ce818590363e3e99a4a2280334454a", size = 1700569, upload-time = "2026-03-28T17:17:46.165Z" }, - { url = "https://files.pythonhosted.org/packages/d8/0d/66402894dbcf470ef7db99449e436105ea862c24f7ea4c95c683e635af35/aiohttp-3.13.4-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:6f6ec32162d293b82f8b63a16edc80769662fbd5ae6fbd4936d3206a2c2cc63b", size = 1707407, upload-time = "2026-03-28T17:17:48.825Z" }, - { url = "https://files.pythonhosted.org/packages/2f/eb/af0ab1a3650092cbd8e14ef29e4ab0209e1460e1c299996c3f8288b3f1ff/aiohttp-3.13.4-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:5903e2db3d202a00ad9f0ec35a122c005e85d90c9836ab4cda628f01edf425e2", size = 1752214, upload-time = 
"2026-03-28T17:17:51.206Z" }, - { url = "https://files.pythonhosted.org/packages/5a/bf/72326f8a98e4c666f292f03c385545963cc65e358835d2a7375037a97b57/aiohttp-3.13.4-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:2d5bea57be7aca98dbbac8da046d99b5557c5cf4e28538c4c786313078aca09e", size = 1562162, upload-time = "2026-03-28T17:17:53.634Z" }, - { url = "https://files.pythonhosted.org/packages/67/9f/13b72435f99151dd9a5469c96b3b5f86aa29b7e785ca7f35cf5e538f74c0/aiohttp-3.13.4-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:bcf0c9902085976edc0232b75006ef38f89686901249ce14226b6877f88464fb", size = 1768904, upload-time = "2026-03-28T17:17:55.991Z" }, - { url = "https://files.pythonhosted.org/packages/18/bc/28d4970e7d5452ac7776cdb5431a1164a0d9cf8bd2fffd67b4fb463aa56d/aiohttp-3.13.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:c3295f98bfeed2e867cab588f2a146a9db37a85e3ae9062abf46ba062bd29165", size = 1723378, upload-time = "2026-03-28T17:17:58.348Z" }, - { url = "https://files.pythonhosted.org/packages/53/74/b32458ca1a7f34d65bdee7aef2036adbe0438123d3d53e2b083c453c24dd/aiohttp-3.13.4-cp314-cp314-win32.whl", hash = "sha256:a598a5c5767e1369d8f5b08695cab1d8160040f796c4416af76fd773d229b3c9", size = 438711, upload-time = "2026-03-28T17:18:00.728Z" }, - { url = "https://files.pythonhosted.org/packages/40/b2/54b487316c2df3e03a8f3435e9636f8a81a42a69d942164830d193beb56a/aiohttp-3.13.4-cp314-cp314-win_amd64.whl", hash = "sha256:c555db4bc7a264bead5a7d63d92d41a1122fcd39cc62a4db815f45ad46f9c2c8", size = 464977, upload-time = "2026-03-28T17:18:03.367Z" }, - { url = "https://files.pythonhosted.org/packages/47/fb/e41b63c6ce71b07a59243bb8f3b457ee0c3402a619acb9d2c0d21ef0e647/aiohttp-3.13.4-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:45abbbf09a129825d13c18c7d3182fecd46d9da3cfc383756145394013604ac1", size = 781549, upload-time = "2026-03-28T17:18:05.779Z" }, - { url = 
"https://files.pythonhosted.org/packages/97/53/532b8d28df1e17e44c4d9a9368b78dcb6bf0b51037522136eced13afa9e8/aiohttp-3.13.4-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:74c80b2bc2c2adb7b3d1941b2b60701ee2af8296fc8aad8b8bc48bc25767266c", size = 514383, upload-time = "2026-03-28T17:18:08.096Z" }, - { url = "https://files.pythonhosted.org/packages/1b/1f/62e5d400603e8468cd635812d99cb81cfdc08127a3dc474c647615f31339/aiohttp-3.13.4-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c97989ae40a9746650fa196894f317dafc12227c808c774929dda0ff873a5954", size = 518304, upload-time = "2026-03-28T17:18:10.642Z" }, - { url = "https://files.pythonhosted.org/packages/90/57/2326b37b10896447e3c6e0cbef4fe2486d30913639a5cfd1332b5d870f82/aiohttp-3.13.4-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dae86be9811493f9990ef44fff1685f5c1a3192e9061a71a109d527944eed551", size = 1893433, upload-time = "2026-03-28T17:18:13.121Z" }, - { url = "https://files.pythonhosted.org/packages/d2/b4/a24d82112c304afdb650167ef2fe190957d81cbddac7460bedd245f765aa/aiohttp-3.13.4-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:1db491abe852ca2fa6cc48a3341985b0174b3741838e1341b82ac82c8bd9e871", size = 1755901, upload-time = "2026-03-28T17:18:16.21Z" }, - { url = "https://files.pythonhosted.org/packages/9e/2d/0883ef9d878d7846287f036c162a951968f22aabeef3ac97b0bea6f76d5d/aiohttp-3.13.4-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:0e5d701c0aad02a7dce72eef6b93226cf3734330f1a31d69ebbf69f33b86666e", size = 1876093, upload-time = "2026-03-28T17:18:18.703Z" }, - { url = "https://files.pythonhosted.org/packages/ad/52/9204bb59c014869b71971addad6778f005daa72a96eed652c496789d7468/aiohttp-3.13.4-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8ac32a189081ae0a10ba18993f10f338ec94341f0d5df8fff348043962f3c6f8", size = 
1970815, upload-time = "2026-03-28T17:18:21.858Z" }, - { url = "https://files.pythonhosted.org/packages/d6/b5/e4eb20275a866dde0f570f411b36c6b48f7b53edfe4f4071aa1b0728098a/aiohttp-3.13.4-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:98e968cdaba43e45c73c3f306fca418c8009a957733bac85937c9f9cf3f4de27", size = 1816223, upload-time = "2026-03-28T17:18:24.729Z" }, - { url = "https://files.pythonhosted.org/packages/d8/23/e98075c5bb146aa61a1239ee1ac7714c85e814838d6cebbe37d3fe19214a/aiohttp-3.13.4-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:ca114790c9144c335d538852612d3e43ea0f075288f4849cf4b05d6cd2238ce7", size = 1649145, upload-time = "2026-03-28T17:18:27.269Z" }, - { url = "https://files.pythonhosted.org/packages/d6/c1/7bad8be33bb06c2bb224b6468874346026092762cbec388c3bdb65a368ee/aiohttp-3.13.4-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:ea2e071661ba9cfe11eabbc81ac5376eaeb3061f6e72ec4cc86d7cdd1ffbdbbb", size = 1816562, upload-time = "2026-03-28T17:18:29.847Z" }, - { url = "https://files.pythonhosted.org/packages/5c/10/c00323348695e9a5e316825969c88463dcc24c7e9d443244b8a2c9cf2eae/aiohttp-3.13.4-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:34e89912b6c20e0fd80e07fa401fd218a410aa1ce9f1c2f1dad6db1bd0ce0927", size = 1800333, upload-time = "2026-03-28T17:18:32.269Z" }, - { url = "https://files.pythonhosted.org/packages/84/43/9b2147a1df3559f49bd723e22905b46a46c068a53adb54abdca32c4de180/aiohttp-3.13.4-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:0e217cf9f6a42908c52b46e42c568bd57adc39c9286ced31aaace614b6087965", size = 1820617, upload-time = "2026-03-28T17:18:35.238Z" }, - { url = "https://files.pythonhosted.org/packages/a9/7f/b3481a81e7a586d02e99387b18c6dafff41285f6efd3daa2124c01f87eae/aiohttp-3.13.4-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:0c296f1221e21ba979f5ac1964c3b78cfde15c5c5f855ffd2caab337e9cd9182", size = 1643417, upload-time = 
"2026-03-28T17:18:37.949Z" }, - { url = "https://files.pythonhosted.org/packages/8f/72/07181226bc99ce1124e0f89280f5221a82d3ae6a6d9d1973ce429d48e52b/aiohttp-3.13.4-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:d99a9d168ebaffb74f36d011750e490085ac418f4db926cce3989c8fe6cb6b1b", size = 1849286, upload-time = "2026-03-28T17:18:40.534Z" }, - { url = "https://files.pythonhosted.org/packages/1a/e6/1b3566e103eca6da5be4ae6713e112a053725c584e96574caf117568ffef/aiohttp-3.13.4-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:cb19177205d93b881f3f89e6081593676043a6828f59c78c17a0fd6c1fbed2ba", size = 1782635, upload-time = "2026-03-28T17:18:43.073Z" }, - { url = "https://files.pythonhosted.org/packages/37/58/1b11c71904b8d079eb0c39fe664180dd1e14bebe5608e235d8bfbadc8929/aiohttp-3.13.4-cp314-cp314t-win32.whl", hash = "sha256:c606aa5656dab6552e52ca368e43869c916338346bfaf6304e15c58fb113ea30", size = 472537, upload-time = "2026-03-28T17:18:46.286Z" }, - { url = "https://files.pythonhosted.org/packages/bc/8f/87c56a1a1977d7dddea5b31e12189665a140fdb48a71e9038ff90bb564ec/aiohttp-3.13.4-cp314-cp314t-win_amd64.whl", hash = "sha256:014dcc10ec8ab8db681f0d68e939d1e9286a5aa2b993cbbdb0db130853e02144", size = 506381, upload-time = "2026-03-28T17:18:48.74Z" }, - { url = "https://files.pythonhosted.org/packages/0a/f9/17e8a70abe874ec694395119338fde2f13ee1903bd14f3fd5b310b77a1ea/aiohttp-3.13.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:b3f00bb9403728b08eb3951e982ca0a409c7a871d709684623daeab79465b181", size = 755716, upload-time = "2026-03-28T17:18:51.918Z" }, - { url = "https://files.pythonhosted.org/packages/27/b3/fdb36e59b9fb37297b1651248d3d84e61faa49af2faabc1e243d3f75585f/aiohttp-3.13.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:cb15595eb52870f84248d7cc97013a76f52ab02ff74d394be093b1d9b8b82bc0", size = 506500, upload-time = "2026-03-28T17:18:54.755Z" }, - { url = 
"https://files.pythonhosted.org/packages/cf/fb/dacf759c43cfb5fa32568bd369f054eeb23906ab23f4e3663e01e04c7988/aiohttp-3.13.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:907ad36b6a65cff7d88d7aca0f77c650546ba850a4f92c92ecb83590d4613249", size = 499881, upload-time = "2026-03-28T17:18:57.302Z" }, - { url = "https://files.pythonhosted.org/packages/52/cd/7824ee57dde8ca7f62e7fbc247ebe1aa3b5495d3598f0c516f06de1ef7ab/aiohttp-3.13.4-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5539ec0d6a3a5c6799b661b7e79166ad1b7ae71ccb59a92fcb6b4ef89295bc94", size = 1681734, upload-time = "2026-03-28T17:19:00.057Z" }, - { url = "https://files.pythonhosted.org/packages/7a/40/6f4ca61736a16deed2d2762a8dbeaaa48ad292974489be2a2f32f62a4e0b/aiohttp-3.13.4-cp39-cp39-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3b4e07d8803a70dd886b5f38588e5b49f894995ca8e132b06c31a2583ae2ef6e", size = 1653787, upload-time = "2026-03-28T17:19:03.026Z" }, - { url = "https://files.pythonhosted.org/packages/89/80/3793f0a1148a42190f6824ce9a0af79910cd3df8dfc58fa784234a7d9e41/aiohttp-3.13.4-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ce7320a945aac4bf0bb8901600e4f9409eb602f25ce3ef4d275b48f6d704a862", size = 1737964, upload-time = "2026-03-28T17:19:05.77Z" }, - { url = "https://files.pythonhosted.org/packages/15/fd/e41981d0f9e0dccfb8f2580d4e64e6c59d293b9b0815849950cc499fe53a/aiohttp-3.13.4-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:26ed03f7d3d6453634729e2c7600d7255d65e879559c5a48fe1bb78355cde74b", size = 1832226, upload-time = "2026-03-28T17:19:08.809Z" }, - { url = "https://files.pythonhosted.org/packages/fa/69/e6b566c638b37bfa14b98c2c429fcdba3b097a990acc9845fcc779ce39cc/aiohttp-3.13.4-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:4c3f733916e85506b8000dddc071c6b82f8c68f56c99adb328d6550017db062d", size = 1681476, upload-time = "2026-03-28T17:19:11.502Z" }, - { url = "https://files.pythonhosted.org/packages/7d/8c/f1b7f03e745fa6281dd949673297c7ac54d7cc54d2e58beb5135ac5c6204/aiohttp-3.13.4-cp39-cp39-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b3d525648fe7c8b4977e460c18098f9f81d7991d72edfdc2f13cf96068f279bc", size = 1573061, upload-time = "2026-03-28T17:19:14.437Z" }, - { url = "https://files.pythonhosted.org/packages/bc/56/e7e972f1bed922297d72cc1d27bae6b2e28fdc2d6a895320e396a93c0f8a/aiohttp-3.13.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:4e2e68085730a03704beb2cff035fa8648f62c9f93758d7e6d70add7f7bb5b3b", size = 1653248, upload-time = "2026-03-28T17:19:17.432Z" }, - { url = "https://files.pythonhosted.org/packages/cf/98/3d63d2f2e06808911e103d6d47c400548cf26a23dd3275de594339ff8e96/aiohttp-3.13.4-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:797613182ffaaca0b9ad5f3b3d3ce5d21242c768f75e66c750b8292bd97c9de3", size = 1666599, upload-time = "2026-03-28T17:19:20.17Z" }, - { url = "https://files.pythonhosted.org/packages/da/c8/31e487fb16d37c89cc6ee190a424b218471750ac48a227e042e200a17687/aiohttp-3.13.4-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:2d15e7e4f1099d9e4d863eaf77a8eee5dcb002b7d7188061b0fbee37f845899e", size = 1709919, upload-time = "2026-03-28T17:19:22.872Z" }, - { url = "https://files.pythonhosted.org/packages/c1/86/3b742bd9204b7deb4f61e6723b1f42a8211ccc60dfddb3e52a6cd4329d46/aiohttp-3.13.4-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:19f60011ad60e40a01d242238bb335399e3a4d8df958c63cbb835add8d5c3b5a", size = 1560523, upload-time = "2026-03-28T17:19:25.879Z" }, - { url = "https://files.pythonhosted.org/packages/72/63/6b80cef343a0527690588808d02aad7604cc4e23eaab207179e77dd607be/aiohttp-3.13.4-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:c344c47e85678e410b064fc2ace14db86bb69db7ed5520c234bf13aed603ec30", size = 1731336, 
upload-time = "2026-03-28T17:19:29.02Z" }, - { url = "https://files.pythonhosted.org/packages/d4/3c/9b39bc9609cac87e19b3394b7ed4bbab3787b434b14e012b9e16be64e9d5/aiohttp-3.13.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:d904084985ca66459e93797e5e05985c048a9c0633655331144c089943e53d12", size = 1667646, upload-time = "2026-03-28T17:19:31.797Z" }, - { url = "https://files.pythonhosted.org/packages/21/72/3fb0ea857c891de89f6914f737f7423b7fa4dd1f46d8ce621eb07595ff4c/aiohttp-3.13.4-cp39-cp39-win32.whl", hash = "sha256:1746338dc2a33cf706cd7446575d13d451f28f9860bebc908c7632b22e71ae3f", size = 441019, upload-time = "2026-03-28T17:19:34.79Z" }, - { url = "https://files.pythonhosted.org/packages/b1/61/8a7191782a31ae3c7f7cee2cd2e37b3ee5849666767db116d449cfe20b88/aiohttp-3.13.4-cp39-cp39-win_amd64.whl", hash = "sha256:a5444dce2e6fba0a1dc2d58d026e674f25f21de178c6f844342629bcef019f2f", size = 464025, upload-time = "2026-03-28T17:19:37.362Z" }, +sdist = { url = "https://files.pythonhosted.org/packages/ee/ab/93ce242f899b68c51b0578c027aafa791ab3614cb9345fa5d37b5f5c8e3e/aiohttp-3.14.0.tar.gz", hash = "sha256:2882de819734c715fd1b9c11c97e09fa020d14438203d1d354d8ed1702791c9b", size = 7940674, upload-time = "2026-06-01T19:41:02.763Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/f0/f81190ba488cd106c2fc6d92680e56bb223bbbbf1e6908c2617011290112/aiohttp-3.14.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:692e409052e7436029bbb32977cd7c5bf806ac5fa4085b973996785ffadad33c", size = 760606, upload-time = "2026-06-01T19:36:39.054Z" }, + { url = "https://files.pythonhosted.org/packages/f6/54/444d37eebf0f15db661ca44ec7caf93962f3c5ca92eb4c9a5d888b70aaa2/aiohttp-3.14.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:40af7ebe53c7990e110dc4ad03566b12c3ac996254298a3d39046dd69cfcb2c2", size = 514677, upload-time = "2026-06-01T19:36:42.408Z" }, + { url = 
"https://files.pythonhosted.org/packages/d0/d1/da280e23321c132c0a3fa7c8cc2830621d79174edc64c829443346489a36/aiohttp-3.14.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02cb2ffbb7da32f82e21ad9952669c45bd88a80e0878264c2f59fe1c6fb2badd", size = 510155, upload-time = "2026-06-01T19:36:44.072Z" }, + { url = "https://files.pythonhosted.org/packages/09/b8/2e36d54d0991ec5bba451444004591ee0af58cb1662a3a81c562878b9c1f/aiohttp-3.14.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2e2514cb7195f6d7c219339635bea71ae47d1569b051300d32df9dcfabcdb869", size = 1699947, upload-time = "2026-06-01T19:36:45.762Z" }, + { url = "https://files.pythonhosted.org/packages/57/95/a31d8ea1a0b9ecc084f5a7dd0b431ce64ef585918bb7bdc82afe11843877/aiohttp-3.14.0-cp310-cp310-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:30e8b7eeb42d02c120ca90d6c6e076a221a16b70a6dac9ae44c7ab5104cc7fe4", size = 1664364, upload-time = "2026-06-01T19:36:47.653Z" }, + { url = "https://files.pythonhosted.org/packages/01/f6/5de3ddffc87a9e8d09b3be38fbd6dd1a736b2ad477a7e787dcb85f57f338/aiohttp-3.14.0-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:63e38be0d75a654deaa06be32fb4cab883a4222940be1d05861b6717679cbadb", size = 1761186, upload-time = "2026-06-01T19:36:49.355Z" }, + { url = "https://files.pythonhosted.org/packages/33/8c/03c5438ec35d7e3a4f33fe895d6c3ec7540a7cec46065f21851211e1ee4d/aiohttp-3.14.0-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:1210d4c87cc00128160c7384ab41877a701295b97cffa6362f908a49b6e8a7ca", size = 1849727, upload-time = "2026-06-01T19:36:51.478Z" }, + { url = "https://files.pythonhosted.org/packages/22/32/5a05303b0874458920b73f48b8779cc3a93d503f121b38dcc0456dbd698c/aiohttp-3.14.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:1a78a77366ed158a0a54b076990e575d7b7cdb728cbfd02711eadab150f2269f", size = 1708197, upload-time = "2026-06-01T19:36:53.241Z" }, + { url = "https://files.pythonhosted.org/packages/7d/62/478f169488d61414c0a05e7fe423b59ae3d9dcc933d1f0e4acc2c5d5bc3e/aiohttp-3.14.0-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f4d2038c64f36df96cfd3fa0937910e231eafbf897e70a06c155a817bb632fa6", size = 1578147, upload-time = "2026-06-01T19:36:55.154Z" }, + { url = "https://files.pythonhosted.org/packages/1d/af/b20af85765658972d3337834bd5eebba91b962794f2b4fc3e0ee8c85c0e1/aiohttp-3.14.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4714c70067a08b604d0bf3bc4dfdf82e52944afab41d0428d460862763d2f79b", size = 1665836, upload-time = "2026-06-01T19:36:56.94Z" }, + { url = "https://files.pythonhosted.org/packages/8d/a3/771879cfd59948f4544b172189048905feff802f20f1c6c5411e998a3e06/aiohttp-3.14.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:f79bfd2847513a7ac801bbafd1de02348a37926ac439eeb4bfe96fcff4eada15", size = 1680335, upload-time = "2026-06-01T19:36:58.642Z" }, + { url = "https://files.pythonhosted.org/packages/f4/16/582e36ad1d32133cd40659f3bc98e71c22179665a1cfbbb4713bce339c06/aiohttp-3.14.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:25e9f1d2465a210d60edb64d7b204a147e85d4c194eecef3d1604fb5ace678ce", size = 1731180, upload-time = "2026-06-01T19:37:00.583Z" }, + { url = "https://files.pythonhosted.org/packages/11/bc/80708fe3f64a07a2c306a42fc7b009118a952709761d215f6d1b4c57195b/aiohttp-3.14.0-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:b5314743ebe926c2fda35d0a298c565c885505f6635c2a30936363404cf274a7", size = 1565805, upload-time = "2026-06-01T19:37:02.446Z" }, + { url = "https://files.pythonhosted.org/packages/57/8f/8d25897f8273a32fe4ad40a8885eec4f397377ed46e8e383078169f60316/aiohttp-3.14.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:28eee8de1d69711c53116df8202f1c2aa0e3f80ef912a88fc18d159d53e7110b", size = 
1742496, upload-time = "2026-06-01T19:37:04.222Z" }, + { url = "https://files.pythonhosted.org/packages/9f/7d/c341d32ab2dec56c8478740695743dc6c21b383cace9376a3eab16311a07/aiohttp-3.14.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:89ed35666c95d3efe1955056afcde09e62a57a34e2a4398b17f9f6c1564f0b25", size = 1691240, upload-time = "2026-06-01T19:37:06.277Z" }, + { url = "https://files.pythonhosted.org/packages/37/0f/a81207dd7a2d4a4f645b3a3f8b5a1da1159dc63117ffb137b698fd6df50f/aiohttp-3.14.0-cp310-cp310-win32.whl", hash = "sha256:5e4646e9a6af29af354204011bf5769cb0276ec5b64653e42f90b3e13845169f", size = 454686, upload-time = "2026-06-01T19:37:07.96Z" }, + { url = "https://files.pythonhosted.org/packages/7f/ae/842357f2afb9c915715c6f5775239d987f5d0f845abf7675fa794e0a9d40/aiohttp-3.14.0-cp310-cp310-win_amd64.whl", hash = "sha256:22a8d06f204e0518a586d770032db3c7043c9ba3693081b3e3ad425e1458d594", size = 478677, upload-time = "2026-06-01T19:37:09.652Z" }, + { url = "https://files.pythonhosted.org/packages/6b/d1/330fb22c9535ec177b52396905131c6e39447244b6ca876262939af668ef/aiohttp-3.14.0-cp310-cp310-win_arm64.whl", hash = "sha256:4acfc34bd4d3c58754fc9f22ff1b5e92aabce68f3d4bf7b71a0b732d9bceb78a", size = 450364, upload-time = "2026-06-01T19:37:11.279Z" }, + { url = "https://files.pythonhosted.org/packages/67/47/7727bfe8db93f8835a001bd4359d8480cc68d1259b8bce334668f8be97bd/aiohttp-3.14.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:54bf3522d6f7351e55f89a62d5c2bf138ad557b031670266c5df604ae88e0b5a", size = 759147, upload-time = "2026-06-01T19:37:12.918Z" }, + { url = "https://files.pythonhosted.org/packages/eb/f2/cd3fedff6fade73d71df9ec908c210cec518ef90fd00289250684b90aecf/aiohttp-3.14.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0746d9fb0ac4fdef643a84494efe3f06d50335dd8c7a530228b86448aae0a803", size = 513705, upload-time = "2026-06-01T19:37:14.633Z" }, + { url = 
"https://files.pythonhosted.org/packages/5a/fe/49746b6b610144a06323bebd8e1211a390310d8c69b98dd6d52df341bc3e/aiohttp-3.14.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9f3a96b6d39a4872222beee72e1df41d2ff886ae96152cf3e757ef8c5673ef0e", size = 509627, upload-time = "2026-06-01T19:37:16.385Z" }, + { url = "https://files.pythonhosted.org/packages/4c/3f/28f2f6cf3d5c0e7b01b27140d0e7873fd11fb341169ad3ce78ad04aba628/aiohttp-3.14.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d336820adbb914debbc90a1d8c1bfc4bea55996aecf64866a989d35d1f9fd903", size = 1769293, upload-time = "2026-06-01T19:37:18.067Z" }, + { url = "https://files.pythonhosted.org/packages/97/6f/2e5f1b525d5474b12b3c60abf733a755845f3bceff21542081ada515f837/aiohttp-3.14.0-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:71b2604c9bfc1b115547d63a094d5244b3f02799833513a99a68aaa7b167c4cb", size = 1732363, upload-time = "2026-06-01T19:37:20.138Z" }, + { url = "https://files.pythonhosted.org/packages/a8/ce/596120faa85ca7b19cd061e3f2f3be23aa8f11a0aedf9191db9e0da1bd76/aiohttp-3.14.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:610d68800435903e303ca0542b9d3e4eb72a12ff33a6d471a070c1d81eebd3c2", size = 1840375, upload-time = "2026-06-01T19:37:22.104Z" }, + { url = "https://files.pythonhosted.org/packages/72/3c/a7ffe05a757a4a7867643da69357ec41f506879fbd1b231d2ed90af246b2/aiohttp-3.14.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:514db9a79337068981ee2137310283a07b4b885c584991097a91a4da419bcb81", size = 1921484, upload-time = "2026-06-01T19:37:24.068Z" }, + { url = "https://files.pythonhosted.org/packages/93/fa/2c861170bbd4a491de93a69e081db1d971092569e0d593a98ef62c384dc1/aiohttp-3.14.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:c452d17eeb95d563fc8b936f3050301dbd1d268126c4632d8b70ede9696202ee", size = 1774153, upload-time = "2026-06-01T19:37:26.256Z" }, + { url = "https://files.pythonhosted.org/packages/9d/da/1d2f5a165f47ec9b1f69d37b8b977fdc4d501aa72ffb7930db27bb9e49ea/aiohttp-3.14.0-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:ed94a81506e3d1bdbad5108f497a58f2a2354aedb4ca314d5326f07d1fd1ac2d", size = 1632569, upload-time = "2026-06-01T19:37:28.192Z" }, + { url = "https://files.pythonhosted.org/packages/46/1d/7a6e295c4257252f70f69e90864fdad74b6a1293054fb3f9e65a15de6d63/aiohttp-3.14.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1394dce36e0f0d260ac0b555a654de19cb989f3c1b8bdd24f505314dfea18a00", size = 1740325, upload-time = "2026-06-01T19:37:30.08Z" }, + { url = "https://files.pythonhosted.org/packages/f1/7e/e1899b1ca3ec62f1eab2a5cbde14039b97493f7f53eb88d9b668562ffa8d/aiohttp-3.14.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:d1467d1e7b48a73ca7237e0ee4335f3d02b923dbc27b82fd254bc301c97d4026", size = 1748691, upload-time = "2026-06-01T19:37:32.211Z" }, + { url = "https://files.pythonhosted.org/packages/ec/54/4e6b61c1fe7d3433f82bcc6bd7e4d7c683a742a10c9b12a025fd3695c047/aiohttp-3.14.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:6a5f3532125233c261cf61f32df4059cfcf482eb793c7d3db8452e3142028b86", size = 1814477, upload-time = "2026-06-01T19:37:34.173Z" }, + { url = "https://files.pythonhosted.org/packages/9c/38/86fd51be2e08d8e45c83d879d255f10391903cd9fe2a16512f7591a15873/aiohttp-3.14.0-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:3ea81eb518a2ecb319d8ec6d1424a37c773f6634bd87d6985eb606b2faac419f", size = 1623393, upload-time = "2026-06-01T19:37:36.281Z" }, + { url = "https://files.pythonhosted.org/packages/78/49/466e947a42a88ee23c486d036e7e5d1b097f1bafd8084ad9c9a0a92f0f43/aiohttp-3.14.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:32e735c3182de7b64f6941a4ede48b38c7f47d9437bd615dd30b5bda8fa1bc93", size = 
1824097, upload-time = "2026-06-01T19:37:38.421Z" }, + { url = "https://files.pythonhosted.org/packages/f3/89/35f3410bc284682338a1be6b6ea0c5abfa05f063942cfaa9256608440434/aiohttp-3.14.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:c21ca9a1c63d4509158f478aeb9d02914dcc52adc68d1bc9dee2452284ee5996", size = 1764790, upload-time = "2026-06-01T19:37:40.755Z" }, + { url = "https://files.pythonhosted.org/packages/42/80/2d4291bd5724d3d17e5951aff5a3e02281483fb47295f0788276ee66cd73/aiohttp-3.14.0-cp311-cp311-win32.whl", hash = "sha256:19ca5fc84130675ba11c6ca5c7da5cb65f7bf8a32cdd2b616bf49cd334688aae", size = 454176, upload-time = "2026-06-01T19:37:42.837Z" }, + { url = "https://files.pythonhosted.org/packages/59/ed/41d0ad4f6ececffc32bdf1f7b494e5498f7ca5c849ea2e3cc9bbd1668251/aiohttp-3.14.0-cp311-cp311-win_amd64.whl", hash = "sha256:d488e6e9d3bb8ba5ae7066d5be885ae9670eba021b8c6ccb9a3a568e6b19d6e5", size = 479334, upload-time = "2026-06-01T19:37:44.776Z" }, + { url = "https://files.pythonhosted.org/packages/d1/86/c0b5e305c770053f8c3d069bb52b8196917ba91949d1962d52eb307fb0d2/aiohttp-3.14.0-cp311-cp311-win_arm64.whl", hash = "sha256:8b93618102caf12801638a01a2b478a55410ddd71bd41cfaf6f707953a49ac43", size = 450262, upload-time = "2026-06-01T19:37:46.461Z" }, + { url = "https://files.pythonhosted.org/packages/89/97/2b6889bfb6b6847520d50d95eb8c4307a45e28aaca39faf4a9454b3d1b2f/aiohttp-3.14.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:b29518c9c2ec7e373e68259206a137c7f4f5439c58baaec4b5ab3ab799850a4e", size = 750194, upload-time = "2026-06-01T19:37:48.164Z" }, + { url = "https://files.pythonhosted.org/packages/21/e2/62634b7fff918ed98c3c6b2f0e70d520f7f28846cb412d451b04354c6459/aiohttp-3.14.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:dbec68ce61b64cb73cab4d33df9433427b1713c8bcccb181dce695c1b6f8e87c", size = 506966, upload-time = "2026-06-01T19:37:50.014Z" }, + { url = 
"https://files.pythonhosted.org/packages/dd/fb/5ce075150828c797a5106f1c2fb26034e709d4289b9d2bf8b07f1e59fac6/aiohttp-3.14.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3cdf534aa455593e589302990c5097aa5c92c06c4262a20da22934f9186a5fff", size = 507527, upload-time = "2026-06-01T19:37:51.96Z" }, + { url = "https://files.pythonhosted.org/packages/01/d5/405a0ae4e6b081754a3609c1c97c63a950e000a2def16046f1e736933a0e/aiohttp-3.14.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cb6c657104393b5fbff01a5f59b2023db74058a8077d94475d6c25d03882a108", size = 1762420, upload-time = "2026-06-01T19:37:53.839Z" }, + { url = "https://files.pythonhosted.org/packages/ae/1d/e05a7c896b15a6bc6fb8fc5319eb437861c2c49c34559ef928add6590315/aiohttp-3.14.0-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:46fbbec4e4fab7428d4396a3823f9320e4560aa3113b89eeebce712c27c9ed5a", size = 1733672, upload-time = "2026-06-01T19:37:55.791Z" }, + { url = "https://files.pythonhosted.org/packages/cc/22/a72f7c459e195fa41bf4f7abd1f925b91fe91f8097e51c654229ba144a33/aiohttp-3.14.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2c2c7e05dd5335b298085abf45ddf98673934c3ee1c083d0b9ea13d4186ad500", size = 1805064, upload-time = "2026-06-01T19:37:57.931Z" }, + { url = "https://files.pythonhosted.org/packages/80/50/e85bdaba0be59ca4838005ebfef4048fcdd5f35a02b07057a9a123394440/aiohttp-3.14.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:3c7139100fbaae76515b73051d8f0aa3a3ff02e415eec8a8eee8e2223d9ba955", size = 1902125, upload-time = "2026-06-01T19:38:00.225Z" }, + { url = "https://files.pythonhosted.org/packages/19/d8/51de5c6b971c27bb1ef620293b8d1ca611ec78736b34b3f6ccf68e4c8785/aiohttp-3.14.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:78d6f9286a629ce52728430afe18f8ed2b6c39a1fddb3802d7244b9983910ad2", size = 1783112, upload-time = "2026-06-01T19:38:02.641Z" }, + { url = "https://files.pythonhosted.org/packages/73/ae/b4402bfde77e43dfb1b6ccff83c7b7ab63ed06b50c4754f0c5423fb374fe/aiohttp-3.14.0-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:cc3c3e12cdaeb92d7dcf13db00e9f6b1956b910e47256e696df1cfa946d02159", size = 1586356, upload-time = "2026-06-01T19:38:04.637Z" }, + { url = "https://files.pythonhosted.org/packages/bc/05/750a3265ca4dc54a460bd0cb1121a8f2ce9171fce4a135fb47ea7fd594d2/aiohttp-3.14.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4d6a998191f5ebe3b8c28463ff72bc030250008b3193c402464efadd08b5ca02", size = 1723119, upload-time = "2026-06-01T19:38:06.713Z" }, + { url = "https://files.pythonhosted.org/packages/37/01/8c0812c50b3b1b1c37b323bf170d6be8847a8f234060485b7d1e71953f60/aiohttp-3.14.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:0fc2b75ae8d169d853be2862d960be8550da6c5c65711d5476407eb3fdb006bd", size = 1757216, upload-time = "2026-06-01T19:38:08.736Z" }, + { url = "https://files.pythonhosted.org/packages/47/2a/50fb98028a26887cbe48dcc1df92a90825615bc73b5584301304090cded8/aiohttp-3.14.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:16eee56bcc72d04600bc56c1759982c2385ec0b41d3fd3521f836bf64a0957ef", size = 1770500, upload-time = "2026-06-01T19:38:11.111Z" }, + { url = "https://files.pythonhosted.org/packages/bd/32/0ffd598a2fa2b9a423daf242e700cfdabda35d6e602394ad9ae58972c1c7/aiohttp-3.14.0-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:5a2e7ca615c3ddc15b82687e05a624e5f5cba3f1d6c20cb81172d70ea498451e", size = 1576224, upload-time = "2026-06-01T19:38:13.391Z" }, + { url = "https://files.pythonhosted.org/packages/0b/f9/b9fc381dd9b66afb33f2634c40e229d106467be0afcabe79648631ab6712/aiohttp-3.14.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:f0b7b8bbbec3ce9467ee0ebe334622fd90624f593edd3136c567811453fc4fae", size = 
1794252, upload-time = "2026-06-01T19:38:15.498Z" }, + { url = "https://files.pythonhosted.org/packages/a8/fb/05d9214c975f23225a8cd5c439325e338c7c377b315480ef3871db51f54e/aiohttp-3.14.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5ba10966d4f03dd96a14365be4b8e37c327c76f11c3ca867116966cdd9f98066", size = 1760193, upload-time = "2026-06-01T19:38:17.624Z" }, + { url = "https://files.pythonhosted.org/packages/d9/4b/02992fc4fb9e1b6673ee3f888a8e587a6447afda1f6f4aca776c148c2876/aiohttp-3.14.0-cp312-cp312-win32.whl", hash = "sha256:101df7779c80c0636014a6b2c6642acd3efb5b355d48347c9d7dfb720aee9430", size = 448650, upload-time = "2026-06-01T19:38:19.545Z" }, + { url = "https://files.pythonhosted.org/packages/39/e9/246532214c3abda518477cbaaf16d420295ad8effa5233844cbb38f299ab/aiohttp-3.14.0-cp312-cp312-win_amd64.whl", hash = "sha256:b0a5747586d4467efd1f932710b269131c9717a872dce082cd92a00c1c13123a", size = 476145, upload-time = "2026-06-01T19:38:21.505Z" }, + { url = "https://files.pythonhosted.org/packages/2b/c3/63f8c20090048915711598b0adf475b149216d736157961de06480a45b15/aiohttp-3.14.0-cp312-cp312-win_arm64.whl", hash = "sha256:5f1c5be60add78fabb4aacd13c5a348ae79d2fcbfc7fa78da8f1eb192273b370", size = 444250, upload-time = "2026-06-01T19:38:24.027Z" }, + { url = "https://files.pythonhosted.org/packages/21/61/d11f7d9a3144bffe825247d6367cd93053666da50b94707c9129c78868d5/aiohttp-3.14.0-cp313-cp313-android_21_arm64_v8a.whl", hash = "sha256:25400d710641a8040bf022a8a99f579e581ffa1c5bd42c33255d7d6f3957c127", size = 502399, upload-time = "2026-06-01T19:38:25.955Z" }, + { url = "https://files.pythonhosted.org/packages/4f/9b/a7e317625d36356844f8bb022cabd305b541f968856cc3c2e0b58e53ee6e/aiohttp-3.14.0-cp313-cp313-android_21_x86_64.whl", hash = "sha256:c5492b9929826e07cc3fcb9739ae87aab05dff6b5e67a9b73fd1700c6d008981", size = 510068, upload-time = "2026-06-01T19:38:27.828Z" }, + { url = 
"https://files.pythonhosted.org/packages/11/41/cc2d2cfbfbdc3126ba258f3cd27d1ac8a33492ae3c35a4583ee21f0ba7f1/aiohttp-3.14.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:3366751d68d237c621264233a32f3078bbc21b7904ab90a77e03d21390c742c6", size = 481670, upload-time = "2026-06-01T19:38:29.836Z" }, + { url = "https://files.pythonhosted.org/packages/3c/07/381f4023c3b08cb616e520f566d8c58957abad54e56441d41fe67cfb0195/aiohttp-3.14.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:57ea07d28695a7a40304d42251892a8df765e5588c10ee32afeddcd5df33c0a2", size = 487591, upload-time = "2026-06-01T19:38:31.704Z" }, + { url = "https://files.pythonhosted.org/packages/fb/4d/4506fdb7a022bdf70011a3bbb4ca00c5c570026ef6a3c5bd7bc70c39089c/aiohttp-3.14.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:076cb014191ae2e65d949e1ad01f1dcfe33e32789b5172510f3e79c79fc04d50", size = 496503, upload-time = "2026-06-01T19:38:33.6Z" }, + { url = "https://files.pythonhosted.org/packages/ef/7d/c814111e04894a45d9e2defc94443879a6f118d9633d5fedfe6e2e8af5f0/aiohttp-3.14.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:2f3fc37054564dee64a855b5b092d87ec35dcddfaabf7dacb1c8a2b1f83dc0a9", size = 745870, upload-time = "2026-06-01T19:38:36.013Z" }, + { url = "https://files.pythonhosted.org/packages/c6/ee/80eee0efddfe187e7cd05027086b7ce1c0e492e82a4eda58f5c5543a44a0/aiohttp-3.14.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8fcaef74d2ab0f607d7ff85a0d15e21bb5a258c4a58df1908396eb50d7f4ed3c", size = 505588, upload-time = "2026-06-01T19:38:38.282Z" }, + { url = "https://files.pythonhosted.org/packages/d6/f8/0f28f04eef75d52fc9c715dde7ce9c0abb810fd20cfeb0fea7afd2ab1e98/aiohttp-3.14.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e4c01b0bfc6209590960e68eac083cd22d5d87c21f974dd6208cafa5d3542bc8", size = 504492, upload-time = "2026-06-01T19:38:40.611Z" }, + { url = 
"https://files.pythonhosted.org/packages/ff/db/44c755232085545065c94378dfce38641b1aee647f4939fcd32f5b32e719/aiohttp-3.14.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f12eb7896e81caf403a2b18c9406426f1207361e7239c057ab29c076d4257e83", size = 1752111, upload-time = "2026-06-01T19:38:42.682Z" }, + { url = "https://files.pythonhosted.org/packages/5e/6a/42e030a46743841414402a3b00cd3d78419055e86c66fb5822c14b5abfc6/aiohttp-3.14.0-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:6c79a044cacf360ec46738d863d2f41c9300d2a06ef4a7402ea0df306a350e61", size = 1729674, upload-time = "2026-06-01T19:38:44.79Z" }, + { url = "https://files.pythonhosted.org/packages/34/26/3199beb415202e3108e7b83ecebe10914d806d33fb9860c3e4aa60a19be3/aiohttp-3.14.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:85e0675f47be4eff0636bf88c02140ea89168ae0df3ff1f3f464e9de9610d277", size = 1798808, upload-time = "2026-06-01T19:38:47.01Z" }, + { url = "https://files.pythonhosted.org/packages/bd/94/b9b6fcf0ee17c21d0d19fb8c22bf83ad18f82e702a9c3bd901a868f5e446/aiohttp-3.14.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:7b33e751cab03fdc960095b1e326cb5a03f5ee577d6ded59f3d1c100f8668882", size = 1891921, upload-time = "2026-06-01T19:38:49.233Z" }, + { url = "https://files.pythonhosted.org/packages/c5/a3/3800dbd095cb2bb165a7ea5d94d790914677e27f45638c7d80e3f34c8945/aiohttp-3.14.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:26d9224c6dd7f5c749aba4f61315a894601448b28d94d12f4dea0903e26d2096", size = 1777241, upload-time = "2026-06-01T19:38:52.04Z" }, + { url = "https://files.pythonhosted.org/packages/21/2a/45be91ad1b860508557448d4cc2e165a2ee68dd865657b73bf66cc5a00fb/aiohttp-3.14.0-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = 
"sha256:6281aecdf2732940f4fe06bd6adec5ae4d59b78b080b8e3a6b81467301010988", size = 1579554, upload-time = "2026-06-01T19:38:54.508Z" }, + { url = "https://files.pythonhosted.org/packages/b4/3d/dc94df99ed1511fdf28314f722643ed334112643cab00223577085e788c4/aiohttp-3.14.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:23e8314e7aed8576fbe33314d218bd81447a3adbc91dc36f1163bf583cd3084c", size = 1714864, upload-time = "2026-06-01T19:38:56.788Z" }, + { url = "https://files.pythonhosted.org/packages/ae/e4/1f1c8acbb3acd5c8f795473b92c9c3d44eb60a5692c6104256c8a1c83a0c/aiohttp-3.14.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:3b54fbff46127aeafdd764cecd0d99fa2f24a0e37ea5c18a7c3a4ac450df1db3", size = 1749803, upload-time = "2026-06-01T19:38:59.367Z" }, + { url = "https://files.pythonhosted.org/packages/0b/c8/c45ea6e7ed84cebba939b9c334498a045ba19d79c61b0110df5f21580de3/aiohttp-3.14.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:b27d89af91a555f58e08e4902dbcbc48862fd40095720ca705990476bd93b7ac", size = 1765023, upload-time = "2026-06-01T19:39:01.651Z" }, + { url = "https://files.pythonhosted.org/packages/a8/a1/a932941784432962fe390e1066823aaef64b4e5ac9fa595df57b5fe472a9/aiohttp-3.14.0-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:25d2326a4967bf705a9f9913a13005e93b6020ad8a9f6bd6bd78850d5171332e", size = 1571671, upload-time = "2026-06-01T19:39:04.044Z" }, + { url = "https://files.pythonhosted.org/packages/b0/01/e1280feac522597a4d46eb67a0cdfa053cfae263033030b761ab146f29fb/aiohttp-3.14.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:a1d209375c503472b3c0a340cdf3c55fcd82e84b46dda7caeaced59faba373ec", size = 1789904, upload-time = "2026-06-01T19:39:06.294Z" }, + { url = "https://files.pythonhosted.org/packages/fa/10/ab28818262f4d26bdb47ed5f1fc7999b69e2fc6e0370b02d0f49011f45ea/aiohttp-3.14.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:666c7c5036df57b693026398b69b41874a1931ac5b3485fd910e57bfac253869", size = 1754516, upload-time = 
"2026-06-01T19:39:08.788Z" }, + { url = "https://files.pythonhosted.org/packages/af/cc/c122eabd7a1b7e0c9bbdd6be60e4715905b858399145d9df872bb94f1427/aiohttp-3.14.0-cp313-cp313-win32.whl", hash = "sha256:23f094a1ef64823fd35854ddf5c7a80a078162f37f9d2f7c6142b51a6affa456", size = 448656, upload-time = "2026-06-01T19:39:11.171Z" }, + { url = "https://files.pythonhosted.org/packages/41/a5/bab07d79848a00eedd8ed979ccb302aaea3ac6eb9fa16bd0ed87135869b4/aiohttp-3.14.0-cp313-cp313-win_amd64.whl", hash = "sha256:e03abdaa17d553f17e1d1d06bb266b3970106c78051d06795723e748d8e49d11", size = 475803, upload-time = "2026-06-01T19:39:13.439Z" }, + { url = "https://files.pythonhosted.org/packages/d1/a0/f03ade8566c153666a3871afccbedf6d99911da006325e1fc6cf72a2de99/aiohttp-3.14.0-cp313-cp313-win_arm64.whl", hash = "sha256:acdb400538cf4769543548bb5d1eb23d39bed4f96554a6078cb728c7cb2c268b", size = 443889, upload-time = "2026-06-01T19:39:15.945Z" }, + { url = "https://files.pythonhosted.org/packages/28/03/5f36ab196a88ba5e9648ae5643e6531e67a3a8c0e96f9c6510ff41540fec/aiohttp-3.14.0-cp314-cp314-android_24_arm64_v8a.whl", hash = "sha256:363ef9e91014e7891679bfb2ac0a7c6ea93435dbbfd10ecf41b9f06fcf506c5f", size = 503330, upload-time = "2026-06-01T19:39:18.195Z" }, + { url = "https://files.pythonhosted.org/packages/2c/ce/8b49ec2f30f68e02f314f4832186cd45e583360a5a386058be36855d23b6/aiohttp-3.14.0-cp314-cp314-android_24_x86_64.whl", hash = "sha256:884a4edbdad77be9d0ef36142c8b504351b170df0bf62b51e784fadabf311c42", size = 509822, upload-time = "2026-06-01T19:39:20.396Z" }, + { url = "https://files.pythonhosted.org/packages/1a/fe/6edbf5d39bf29322b6816365b17ed8ede4dace164a3aea1abcd30110eb78/aiohttp-3.14.0-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:70ea956f6cc4a37620966b56c2e205d88ca3e6d85ec063277e414b1035cddad3", size = 483329, upload-time = "2026-06-01T19:39:22.607Z" }, + { url = 
"https://files.pythonhosted.org/packages/1b/5a/fae531bdbc6456fb6241f46b7b81e4d8a0dd3fc09118a0055dc7141ac1ec/aiohttp-3.14.0-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:ea3b9806c89f61da22fddf1f12dd524fb368e5e28f1261fbdafe5c3cd8ce893b", size = 489502, upload-time = "2026-06-01T19:39:24.881Z" }, + { url = "https://files.pythonhosted.org/packages/36/f4/48a7b0414db7fed77a03d5dde34508c026afd83510ab6bca08c313855776/aiohttp-3.14.0-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:a071be341c2bd9b0188e62d173509f024e0a35b1c342c53c50f8daaeda8c3bd8", size = 497357, upload-time = "2026-06-01T19:39:27.197Z" }, + { url = "https://files.pythonhosted.org/packages/75/75/e85a13a370acc007fca5feb1fd1b88ac2d8426e6dadd625479b7cadd55a3/aiohttp-3.14.0-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:198cfe61bf253b19da1fb3e0fa122249dc4f14c12709493fed8054aa0411cc76", size = 750898, upload-time = "2026-06-01T19:39:29.563Z" }, + { url = "https://files.pythonhosted.org/packages/9e/e4/3d637f800c724eff0e2bed64df72557444482366fd0a35b0cec0e6968f6c/aiohttp-3.14.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:9dc203d6ce6b9106d54e2a93f41dfdfebfbca2d99962ba503bfd3e5921a6549e", size = 506986, upload-time = "2026-06-01T19:39:31.872Z" }, + { url = "https://files.pythonhosted.org/packages/1d/df/35161f3598bf7501d2b2a805b41ab4f45a2e34150c421bcb4ef8c0d281a7/aiohttp-3.14.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:9e19d17ab02bf16832a2c8c0d55a486792c5b1645665652ee9531aebcc30cb72", size = 508033, upload-time = "2026-06-01T19:39:34.137Z" }, + { url = "https://files.pythonhosted.org/packages/e5/39/b36e5d3d31e850fb4691dd3e941684ac490a2559249f6fa634b6b0fdf020/aiohttp-3.14.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d925fba0c14d5b498a8028b0107beebdfd16c5d48d702ff54f879cb017aaaca3", size = 1746213, upload-time = "2026-06-01T19:39:36.654Z" }, + { url = 
"https://files.pythonhosted.org/packages/b1/28/24e1409e605a9aa5d84abe0e2acb365354b70ae56d40948101cabe3341ab/aiohttp-3.14.0-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:d33e61021222ce7f9792bcac870d6f58d8adfceda33ab857b01264f4560f2c5f", size = 1705862, upload-time = "2026-06-01T19:39:38.968Z" }, + { url = "https://files.pythonhosted.org/packages/8c/d0/e5eb3ff1daeaf644c7e36a957517672494122628e067c38b263fa04eda77/aiohttp-3.14.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:44eca38755d0105bb32f47d085f5dd449846a449e1245fc105889e3279dcf8e3", size = 1798909, upload-time = "2026-06-01T19:39:41.334Z" }, + { url = "https://files.pythonhosted.org/packages/d3/ba/8943f906f0570342886ababb9a722a44e360f786a028c5e0b0e29e3f735b/aiohttp-3.14.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f13087e06f68fea4941c21a0c541c00553aa16e4f8fd7bbe2b198df761e964d6", size = 1868892, upload-time = "2026-06-01T19:39:43.807Z" }, + { url = "https://files.pythonhosted.org/packages/3a/05/27df32c844b2156e1675a8d8ec22d963e3c8ba469ed7ceb1863320c7b521/aiohttp-3.14.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ff82be7f1ef73634cb77890a770743239bc3d487b848669be1c599889336dc0a", size = 1751659, upload-time = "2026-06-01T19:39:46.398Z" }, + { url = "https://files.pythonhosted.org/packages/7f/62/da182e5910ab912b2e88aa919b61a16046a37a95714a5795b02eb57b2d18/aiohttp-3.14.0-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a150c0875ac8fd87f1c398650841308a30d65facf7416b12dbdb9cfdcbe5a48c", size = 1578775, upload-time = "2026-06-01T19:39:48.902Z" }, + { url = "https://files.pythonhosted.org/packages/66/e3/53c67097e8a5ce98625e91e3fa7f43c9c6940de680345d03b3509a72a078/aiohttp-3.14.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = 
"sha256:edc01ea4e1ec5a1649a28866262bf24195889ff7b27bdd947029a6086741de9b", size = 1710090, upload-time = "2026-06-01T19:39:51.392Z" }, + { url = "https://files.pythonhosted.org/packages/dd/55/0e2732ca598c7a4dfe8a775662376d0ca2977cb1030e48386d4da5d9a456/aiohttp-3.14.0-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:540632bf882ff8fc88f2e1697be0761578e89e0d79fb4a8a6d65dc5da7e729d4", size = 1715016, upload-time = "2026-06-01T19:39:53.807Z" }, + { url = "https://files.pythonhosted.org/packages/5a/96/f0b73730798c9ca525afc30b39f1f81bbe24e245d9654c54d3b39d63212d/aiohttp-3.14.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:860a86bc2c80237f5dff52edcf427e10a8d8352271fd84845429a3e60199e02c", size = 1763810, upload-time = "2026-06-01T19:39:56.31Z" }, + { url = "https://files.pythonhosted.org/packages/71/cc/11acb6c4518f448323405a7312b6f255d0f974a34373ad1db7633c4aadc8/aiohttp-3.14.0-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:5cbd50e6a50d6b99283a826b18cbdebf65b0797689a7535cb0e9dd37be0f63c3", size = 1573064, upload-time = "2026-06-01T19:39:58.718Z" }, + { url = "https://files.pythonhosted.org/packages/de/2d/28c31dde0a7dc98c0ee7d0da2ddcec3f7688c4fc131e5989e278d0c03c0a/aiohttp-3.14.0-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:20144819e99db593e22bbd2f3f2691a5e149f879142d6b8670254708853ff4fb", size = 1775765, upload-time = "2026-06-01T19:40:01.195Z" }, + { url = "https://files.pythonhosted.org/packages/b8/69/155c4ef3aec96417d47024800472b33b16c5d8a665371dcd044c2afdf25d/aiohttp-3.14.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:26b6d79aa54cb4ed50cc7d41ed14e99e0f1fc8e7c2d42f2e05b37aea897b2b52", size = 1733716, upload-time = "2026-06-01T19:40:03.631Z" }, + { url = "https://files.pythonhosted.org/packages/5f/44/6126116fd8a316b712bb615660b855c78466bb67ba1bb1742427eafcf7ac/aiohttp-3.14.0-cp314-cp314-win32.whl", hash = "sha256:106ed074a856f3e21d186b8579e2c8afb6da598e267cdaab01059e13db2fc44d", size = 453684, upload-time = "2026-06-01T19:40:06.277Z" 
}, + { url = "https://files.pythonhosted.org/packages/a2/d7/eff4c58a88c5cac5e38b55f44fb8a6d3929c3cbd77356e383e094d3220bd/aiohttp-3.14.0-cp314-cp314-win_amd64.whl", hash = "sha256:4f770846edae8f00ecc57af825bce811f787f87a7dcf0e90d191790efe5b31f7", size = 481758, upload-time = "2026-06-01T19:40:08.653Z" }, + { url = "https://files.pythonhosted.org/packages/d7/ed/17b5bd9fbcb46e688f02e572f517754a9a75831e7b54702f027761dc4fa5/aiohttp-3.14.0-cp314-cp314-win_arm64.whl", hash = "sha256:acf1581c4f21ed4b80a2dded504d87b055a071a84d5737ea966435f768275ac6", size = 450557, upload-time = "2026-06-01T19:40:11.03Z" }, + { url = "https://files.pythonhosted.org/packages/12/34/6180103ce9aabc8ebff3f7bb55a1228ffe60f61042823031d9692cb7b101/aiohttp-3.14.0-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:6aa1a40f9cbb3da9f80714c5966b8946c21e6a2530d809b9498b33161e3c8733", size = 787878, upload-time = "2026-06-01T19:40:13.401Z" }, + { url = "https://files.pythonhosted.org/packages/92/e9/08954a40e8b7baa3d8beadd2b074b186e9b1e9c8ddabc288678a6265de50/aiohttp-3.14.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:b62af5a8cc96a194eaa01a9ed7b34a3ffa58d3d8daaa1a0d7a749353ad12d228", size = 524400, upload-time = "2026-06-01T19:40:15.972Z" }, + { url = "https://files.pythonhosted.org/packages/08/6a/b5965a634ac4d5ba99a463314cf4ab214ca073fcdc38a15e0294273701fc/aiohttp-3.14.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:6eb63b1417efaf7d1002a6ad034a40d44376afcc16508a57f8e74b49ad26a095", size = 527904, upload-time = "2026-06-01T19:40:18.28Z" }, + { url = "https://files.pythonhosted.org/packages/06/b4/932bcdd850c354d9bcca30f360e475d7852e30413fbbd44b182782ed5432/aiohttp-3.14.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c20b9ad156a79eb97be5cf9e069eec01d2f0dc8472ffbd75299a8b2d4c2cbbde", size = 1912162, upload-time = "2026-06-01T19:40:20.825Z" }, + { url = 
"https://files.pythonhosted.org/packages/c6/85/ce79bab0310d2e3fd2d7bc7e44412abeff7c8338f8a21dd0f2f1714989e5/aiohttp-3.14.0-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:40ae7b0642c25632c7eabc4a04754012691864d2a1b93becf7cddb76027b838a", size = 1778813, upload-time = "2026-06-01T19:40:23.726Z" }, + { url = "https://files.pythonhosted.org/packages/05/54/ba62ac2d1bc87e010aad23751e383b8794e45d931df67677313a2da78823/aiohttp-3.14.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:95f5217e76a046b9f228a101717ef8d42b1eb3d9d196d15202db5bf41df88936", size = 1899969, upload-time = "2026-06-01T19:40:26.406Z" }, + { url = "https://files.pythonhosted.org/packages/dc/82/7cc7907725d83a19f31551334061e1ab8e108b1d7ac52632a2a844a4acb5/aiohttp-3.14.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:1a4a9f17e85b80878c176695c1998c790e83731d8271881e5d356488652a1f9e", size = 1991771, upload-time = "2026-06-01T19:40:29.061Z" }, + { url = "https://files.pythonhosted.org/packages/d0/1c/a57de71a4508c93a830b77c28af3d08cd97f606dedfc6b94275347744508/aiohttp-3.14.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:145262119b07d7f95abc1839add35ba2bfc84551d4b4660ca11542c0b215455b", size = 1868606, upload-time = "2026-06-01T19:40:31.843Z" }, + { url = "https://files.pythonhosted.org/packages/9c/ae/3839726cd49150a53ed340cc24ce5ba09d4c2117020ef9d45542bec5eb2f/aiohttp-3.14.0-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:49a33ded29b0b2fa7a367a02cf0fb89af602bb87542a16177ec8ce1c9c51d12a", size = 1665437, upload-time = "2026-06-01T19:40:35.01Z" }, + { url = "https://files.pythonhosted.org/packages/35/1e/c237923232c7da7f0392ea25d89fc5e60c0e93f685f4ebca8e7bcdd5271c/aiohttp-3.14.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = 
"sha256:2cc736a9c9fc2bc4dd71fd404815741b6573df27c3f985948ec4076989ac57de", size = 1834090, upload-time = "2026-06-01T19:40:37.733Z" }, + { url = "https://files.pythonhosted.org/packages/98/02/a5a7a2524f92d3911761b405a7c067c751891942144adc13e2ad79611e39/aiohttp-3.14.0-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:b4141a3e5342ee3053a9cab54d25b64ed28289c1041e4c54b3d99839314d90ce", size = 1816907, upload-time = "2026-06-01T19:40:40.46Z" }, + { url = "https://files.pythonhosted.org/packages/fa/76/a8b9f0d09234d516af9f2d7dd715557f33b5da3b0b56ead41d1170e86e3c/aiohttp-3.14.0-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:e30871b2d58996cb81aac52d2b1d15ac05257131ef0f90f18c2115a380fbfe7c", size = 1840382, upload-time = "2026-06-01T19:40:43.48Z" }, + { url = "https://files.pythonhosted.org/packages/c9/8e/140e715a0a4bbc211979ea30ec8396ad2ed5bf90ab87d8058fc4668b1923/aiohttp-3.14.0-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:667b881d083ccae3900ea5a241e17e5007ca78844c53ed389bb63d48f729d9c7", size = 1659497, upload-time = "2026-06-01T19:40:46.265Z" }, + { url = "https://files.pythonhosted.org/packages/10/c7/7ba5de8af9650b9767b063c675427b8685f43fa7ce563673a7bc3af60f08/aiohttp-3.14.0-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:b584dfe615d151e9b8f0a8ecb3aee6147f2927ec5b95ba25fe621f5377510928", size = 1870829, upload-time = "2026-06-01T19:40:49.583Z" }, + { url = "https://files.pythonhosted.org/packages/cc/bc/2aaab2f85cadb26ea59c091fa2b8e370d625154b5c14b478f1b489d07551/aiohttp-3.14.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6199707cc40e0e9cd39c36fbc97bec416c704e1d0ddce03412bb3b3e6a90ccd0", size = 1832281, upload-time = "2026-06-01T19:40:52.303Z" }, + { url = "https://files.pythonhosted.org/packages/39/98/31b9ad9fbc01f0075ee7221002df5fd2d10b647f451ca5f30edc802d9dd6/aiohttp-3.14.0-cp314-cp314t-win32.whl", hash = "sha256:a8d93334d4961c9d566b1f046c81dee475b7c21eb730728d38237bfa70d1c8e6", size = 490597, upload-time = 
"2026-06-01T19:40:54.937Z" }, + { url = "https://files.pythonhosted.org/packages/59/1f/299b21441c8de42ff70fddc7cfe65e92f810abcf740739a09b56f7835364/aiohttp-3.14.0-cp314-cp314t-win_amd64.whl", hash = "sha256:2d2ffe9b614f50f069068b3b52e73414e4107fc10b7efc939a76acff9251fdd2", size = 525789, upload-time = "2026-06-01T19:40:57.306Z" }, + { url = "https://files.pythonhosted.org/packages/70/11/7f83fcba9ee05d4c54d61b3f8104da0d43a59adac44dd28effc0c9a10422/aiohttp-3.14.0-cp314-cp314t-win_arm64.whl", hash = "sha256:7a3fc4358e65826c515350f199c210de747cf669998211b1ee6c2e46de364b24", size = 467399, upload-time = "2026-06-01T19:40:59.993Z" }, ] [[package]] From 0d4afa8f392a14d65056712a2f1d938866a46f91 Mon Sep 17 00:00:00 2001 From: Dan Rammer Date: Thu, 4 Jun 2026 17:32:26 -0500 Subject: [PATCH 026/177] fix(mem-wal): fence predecessor with a WAL sentinel on claim (#7110) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem A newly-claiming writer only bumps the manifest `writer_epoch` — it writes **nothing** into the WAL, and WAL slot keys are position-only (no epoch in the path). So until the successor actually writes a slot, a fenced predecessor's next `append` lands in the empty next slot, the `PUT-IF-NOT-EXISTS` succeeds, and `append` returns `Ok` **without** a fence check (the `check_fenced` call only fires on the `AlreadyExists` branch). That window is a correctness hole: the predecessor false-acks a write that later dies at the seal-time manifest CAS — and if the successor already replayed past that position, the entry is orphaned (data loss for an acked write). ## Fix On claim (`epoch >= 2`), drop a **data-less sentinel** WAL entry at the current tip, **before** replay: - The predecessor's next `append` now collides at that slot and surfaces the fence via the existing `AlreadyExists -> check_fenced` path. 
- Writing the sentinel *before* replay guarantees any predecessor entry that landed *below* the sentinel is recovered by replay rather than orphaned. - A lost slot race (a predecessor/concurrent claimer wins the probed slot) re-probes one past the winner; that entry then sits below the sentinel and is still replayed. - Sentinels carry zero batches (empty-schema Arrow IPC + `writer_epoch`/marker metadata) and are skipped by replay's existing empty-batch guard. Epoch 1 (a fresh shard) has no predecessor, so the sentinel is skipped there. This lets writers rely on collision for fencing instead of issuing a per-put fence-check GET on every append. ## Test Adds `test_fence_sentinel_fences_predecessor_without_successor_write`, which exercises the exact race: a successor claims a higher epoch and drops a sentinel **without writing any data batch**, and the predecessor's next append is fenced. Also asserts the sentinel reads back as zero batches and the successor's own writes land after it. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-authored-by: Claude Opus 4.8 (1M context) --- rust/lance/src/dataset/mem_wal/wal.rs | 116 ++++++++++++++++++++++++ rust/lance/src/dataset/mem_wal/write.rs | 8 ++ 2 files changed, 124 insertions(+) diff --git a/rust/lance/src/dataset/mem_wal/wal.rs b/rust/lance/src/dataset/mem_wal/wal.rs index 6232ab0d8ae..7b95d94305a 100644 --- a/rust/lance/src/dataset/mem_wal/wal.rs +++ b/rust/lance/src/dataset/mem_wal/wal.rs @@ -39,6 +39,10 @@ use super::memtable::batch_store::{BatchStore, StoredBatch}; /// Key for storing writer epoch in Arrow IPC file schema metadata. pub const WRITER_EPOCH_KEY: &str = "writer_epoch"; +/// Marks a WAL entry as a data-less fence sentinel (observability only; +/// replay skips sentinels via their empty batch list). +pub const FENCE_SENTINEL_KEY: &str = "fence_sentinel"; + /// Watcher for batch durability using watermark-based tracking. 
/// /// Uses a shared watch channel that broadcasts the durable watermark. @@ -882,6 +886,52 @@ impl WalAppender { self.manifest_store.check_fenced(self.writer_epoch).await } + /// Drop a data-less sentinel at the WAL tip so the predecessor's next + /// `append` collides on PUT-IF-NOT-EXISTS and learns it is fenced, rather + /// than succeeding into the empty next slot. Call *before* replay: any + /// predecessor entry below the sentinel is then recovered, not orphaned. + /// On a lost slot race, re-probes one past the winner. Seeds next position + /// past the sentinel; returns the sentinel position. + pub(crate) async fn write_fence_sentinel(&self) -> Result<u64> { + let sentinel = Bytes::from(serialize_fence_sentinel(self.writer_epoch)?); + let mut next_pos = self.next_entry_position.lock().await; + let mut pos = match *next_pos { + Some(p) => p, + None => self.discover_next_position().await?, + }; + let mut conflicts = 0; + loop { + match atomic_put( + self.object_store.as_ref(), + &self.wal_dir, + &wal_entry_filename(pos), + sentinel.clone(), + ) + .await + { + Ok(()) => { + let next = pos.checked_add(1).ok_or_else(|| { + Error::io(format!("WAL position overflow for shard {}", self.shard_id)) + })?; + *next_pos = Some(next); + self.next_entry_position_hint.store(next, Ordering::SeqCst); + return Ok(pos); + } + Err(AtomicPutError::AlreadyExists) => { + conflicts += 1; + if conflicts >= MAX_APPEND_CREATE_CONFLICTS { + return Err(Error::io(format!( + "fence sentinel write for shard {} failed after {} conflicts", + self.shard_id, conflicts + ))); + } + pos = self.discover_next_position().await?; + } + Err(AtomicPutError::Other(error)) => return Err(error), + } + } + } + async fn discover_next_position(&self) -> Result<u64> { if let Ok(Some(manifest)) = self.manifest_store.read_latest().await { let hint = manifest.wal_entry_position_last_seen; @@ -1053,6 +1103,28 @@ fn serialize_appender_batches(batches: &[RecordBatch], writer_epoch: u64) -> Res Ok(buffer) } +/// Data-less
sentinel: an empty-schema Arrow IPC stream with the writer epoch +/// and a marker flag, no batches. Reads back as `(epoch, [])` so replay skips +/// it. See [`WalAppender::write_fence_sentinel`]. +fn serialize_fence_sentinel(writer_epoch: u64) -> Result<Vec<u8>> { + let mut metadata = std::collections::HashMap::new(); + metadata.insert(WRITER_EPOCH_KEY.to_string(), writer_epoch.to_string()); + metadata.insert(FENCE_SENTINEL_KEY.to_string(), "true".to_string()); + let ipc_schema = Arc::new(ArrowSchema::new_with_metadata( + arrow_schema::Fields::empty(), + metadata, + )); + let mut buffer = Vec::new(); + { + let mut writer = StreamWriter::try_new(&mut buffer, &ipc_schema) + .map_err(|e| Error::io(format!("failed to create fence sentinel IPC writer: {}", e)))?; + writer + .finish() + .map_err(|e| Error::io(format!("failed to finish fence sentinel IPC stream: {}", e)))?; + } + Ok(buffer) +} + fn deserialize_appender_batches(bytes: Bytes) -> Result<(u64, Vec<RecordBatch>)> { let cursor = Cursor::new(bytes); let reader = StreamReader::try_new(cursor, None) @@ -1584,6 +1656,50 @@ mod tests { ); } + #[tokio::test] + async fn test_fence_sentinel_fences_predecessor_without_successor_write() { + // The race the sentinel closes: a successor claims a higher epoch but + // has NOT yet written any data batch. Without the sentinel, the + // predecessor's next append lands in the empty next slot, succeeds, + // and false-acks. With the sentinel, the predecessor collides. + let (store, base_path, _temp_dir) = create_local_store().await; + let shard_id = Uuid::new_v4(); + + let first = WalAppender::open(store.clone(), base_path.clone(), shard_id, 0) + .await + .unwrap(); + let schema = create_test_schema(); + let batch = create_test_batch(&schema, 1); + first.append(vec![batch.clone()]).await.unwrap(); // position 1 + + // Successor claims epoch 2 and drops a sentinel at the tip (position 2) + // — but writes no data of its own.
+ let second = WalAppender::open(store.clone(), base_path.clone(), shard_id, 0) + .await + .unwrap(); + assert_eq!(second.writer_epoch(), 2); + let sentinel_pos = second.write_fence_sentinel().await.unwrap(); + assert_eq!(sentinel_pos, 2, "sentinel should land at the tip"); + + // Predecessor's next append collides with the sentinel and is fenced. + let err = first.append(vec![batch.clone()]).await.unwrap_err(); + assert!( + err.to_string().contains("Writer fenced"), + "expected fence error from append, got: {err}" + ); + + // The sentinel is data-less: a tailer reads it back as zero batches so + // replay skips it. + let tailer = WalTailer::new(store.clone(), base_path.clone(), shard_id); + let entry = tailer.read_entry(sentinel_pos).await.unwrap().unwrap(); + assert_eq!(entry.writer_epoch, 2); + assert!(entry.batches.is_empty(), "sentinel must carry no batches"); + + // Successor's own writes land after the sentinel (position 3). + let res = second.append(vec![batch]).await.unwrap(); + assert_eq!(res.entry_position, 3); + } + #[tokio::test] async fn test_wal_appender_rejects_invalid_input() { let (store, base_path, _temp_dir) = create_local_store().await; diff --git a/rust/lance/src/dataset/mem_wal/write.rs b/rust/lance/src/dataset/mem_wal/write.rs index 0788b657366..441da920b57 100644 --- a/rust/lance/src/dataset/mem_wal/write.rs +++ b/rust/lance/src/dataset/mem_wal/write.rs @@ -806,6 +806,8 @@ async fn replay_memtable_from_wal( position, entry.writer_epoch, our_epoch, shard_id ))); } + // Fence sentinels deserialize to zero batches and are skipped + // here — they carry only a position, no rows. if !entry.batches.is_empty() { memtable.insert_batches_only(entry.batches).await?; } @@ -1208,6 +1210,12 @@ impl ShardWriter { position_hint_seed, )); + // Fence the predecessor before replay (see `write_fence_sentinel`). + // Epoch 1 is a fresh shard with no predecessor to fence. 
+ if epoch >= 2 { + wal_appender.write_fence_sentinel().await?; + } + // Create WAL flusher backed by the shared appender. let mut wal_flusher = WalFlusher::new(wal_appender); From 8369dbb03ebbfb4d13c1e1b00ff2b61ed81cff67 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 4 Jun 2026 16:16:23 -0700 Subject: [PATCH 027/177] chore: add dependabot config for weekly patch-level lockfile bumps (#7111) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds Dependabot configuration to automatically open PRs for patch-level dependency updates every Wednesday. Uses `lockfile-only` mode for Cargo and uv so only the lockfiles are touched — no manifest version specs are modified. Covers: - `Cargo.lock` at `/`, `/python`, `/java/lance-jni` - `python/uv.lock` Maven is excluded since it has no lockfile concept (version specs live directly in `pom.xml`). Co-authored-by: Claude Sonnet 4.6 --- .github/dependabot.yml | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000000..8a46f198419 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,30 @@ +version: 2 +updates: + - package-ecosystem: "cargo" + directory: "/" + versioning-strategy: lockfile-only + schedule: + interval: "weekly" + day: "wednesday" + + - package-ecosystem: "cargo" + directory: "/python" + versioning-strategy: lockfile-only + schedule: + interval: "weekly" + day: "wednesday" + + - package-ecosystem: "cargo" + directory: "/java/lance-jni" + versioning-strategy: lockfile-only + schedule: + interval: "weekly" + day: "wednesday" + + - package-ecosystem: "uv" + directory: "/python" + versioning-strategy: lockfile-only + schedule: + interval: "weekly" + day: "wednesday" + From 5f4c0914b585a55e55dbbd476942a004c70a65e6 Mon Sep 17 00:00:00 2001 From: Wyatt Alt Date: Thu, 4 Jun 2026 16:40:34 -0700 Subject: [PATCH 028/177] perf(fts): 
defer DocSet load until wand actually needs it (#6983) Make `InvertedPartition::load` defer the per-partition `DocSet` work (row_id + num_tokens) until the wand walk actually needs it, instead of materializing the entire DocSet up front. --------- Co-authored-by: Claude Opus 4.8 (1M context) --- rust/lance-index/src/scalar.rs | 17 + rust/lance-index/src/scalar/inverted.rs | 1 + rust/lance-index/src/scalar/inverted/index.rs | 196 ++++++++-- .../src/scalar/inverted/lazy_docset.rs | 346 ++++++++++++++++++ .../lance-index/src/scalar/inverted/scorer.rs | 27 +- rust/lance-index/src/scalar/inverted/wand.rs | 63 +++- rust/lance-index/src/scalar/lance_format.rs | 79 ++++ rust/lance-select/src/mask.rs | 7 + rust/lance/src/dataset/tests/dataset_index.rs | 77 ++++ .../src/dataset/tests/dataset_merge_update.rs | 81 ++++ rust/lance/src/index.rs | 8 +- 11 files changed, 853 insertions(+), 49 deletions(-) create mode 100644 rust/lance-index/src/scalar/inverted/lazy_docset.rs diff --git a/rust/lance-index/src/scalar.rs b/rust/lance-index/src/scalar.rs index 52a64d7d6c9..772dfaf4089 100644 --- a/rust/lance-index/src/scalar.rs +++ b/rust/lance-index/src/scalar.rs @@ -221,6 +221,23 @@ pub trait IndexReader: Send + Sync { range: std::ops::Range, projection: Option<&[&str]>, ) -> Result; + /// Read multiple ranges and concatenate into a single batch. + /// Default impl runs `read_range`s in parallel via `try_join_all`. + async fn read_ranges( + &self, + ranges: &[std::ops::Range], + projection: Option<&[&str]>, + ) -> Result { + if ranges.is_empty() { + return self.read_range(0..0, projection).await; + } + let futures = ranges + .iter() + .map(|r| self.read_range(r.clone(), projection)); + let batches = futures::future::try_join_all(futures).await?; + let schema = batches[0].schema(); + Ok(arrow_select::concat::concat_batches(&schema, &batches)?) + } /// Read a range of rows as a stream of record batches. 
/// /// This allows the caller to process rows incrementally without loading the diff --git a/rust/lance-index/src/scalar/inverted.rs b/rust/lance-index/src/scalar/inverted.rs index 6adf4457f05..b3e497a82ca 100644 --- a/rust/lance-index/src/scalar/inverted.rs +++ b/rust/lance-index/src/scalar/inverted.rs @@ -7,6 +7,7 @@ mod encoding; mod index; mod iter; pub mod json; +mod lazy_docset; pub mod parser; pub mod query; mod scorer; diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index 8acfffb3486..4304223592c 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -53,6 +53,7 @@ use tracing::{info, instrument}; use super::encoding::{PositionBlockBuilder, decode_group_starts}; use super::iter::PostingListIterator; +use super::lazy_docset::LazyDocSet; use super::{InvertedIndexBuilder, InvertedIndexParams, wand::*}; use super::{ builder::{ @@ -375,6 +376,35 @@ impl DeepSizeOf for InvertedIndex { } } +/// Resolve any `Pending` candidates that wand emitted via the +/// deferred-row_id path. After this returns, every entry in +/// `candidates` carries a real row_id. 
+async fn resolve_deferred_candidates( + docs: &LazyDocSet, + candidates: &mut [DocCandidate], +) -> Result<()> { + let pending: Vec = candidates + .iter() + .filter_map(|c| match c.addr { + CandidateAddr::Pending(d) => Some(d), + CandidateAddr::RowId(_) => None, + }) + .collect(); + if pending.is_empty() { + return Ok(()); + } + let mut iter = docs.resolve_row_ids(&pending).await?.into_iter(); + for c in candidates { + if matches!(c.addr, CandidateAddr::Pending(_)) { + let r = iter.next().ok_or_else(|| { + Error::internal("resolve_row_ids returned fewer items than requested") + })?; + c.addr = CandidateAddr::RowId(r); + } + } + Ok(()) +} + impl InvertedIndex { fn format_version(&self) -> InvertedListFormatVersion { self.partitions @@ -532,7 +562,7 @@ impl InvertedIndex { query_tokens: &Tokens, params: &FtsSearchParams, ) -> Result { - let scorer = IndexBM25Scorer::new(self.partitions.iter().map(|part| part.as_ref())); + let (total_tokens, num_docs) = self.aggregate_corpus_stats().await?; let mut terms: Vec = Vec::new(); let mut seen = HashSet::new(); if matches!(params.fuzziness, Some(n) if n != 0) { @@ -555,18 +585,37 @@ impl InvertedIndex { let df = self.df_for_term(term).await?; token_docs.insert(term.clone(), df); } - Ok(MemBM25Scorer::new( - scorer.total_tokens(), - scorer.num_docs(), - token_docs, - )) + Ok(MemBM25Scorer::new(total_tokens, num_docs, token_docs)) } pub async fn bm25_stats_for_terms(&self, terms: &[String]) -> Result<(u64, usize, Vec)> { - let scorer = IndexBM25Scorer::new(self.partitions.iter().map(|part| part.as_ref())); + let (total_tokens, num_docs) = self.aggregate_corpus_stats().await?; let token_docs = futures::future::try_join_all(terms.iter().map(|term| self.df_for_term(term))).await?; - Ok((scorer.total_tokens(), scorer.num_docs(), token_docs)) + Ok((total_tokens, num_docs, token_docs)) + } + + /// Aggregate per-partition `total_tokens` and `num_docs` across the + /// index. 
`len` is cheap (no IO); `total_tokens_num` reads only the + /// num_tokens column the first time per partition and caches it on + /// `LazyDocSet`. Avoids materializing the full DocSet just to get + /// these two scalars. + async fn aggregate_corpus_stats(&self) -> Result<(u64, usize)> { + let io_parallelism = self.store.io_parallelism(); + let num_docs: usize = self.partitions.iter().map(|p| p.docs.len()).sum(); + let futures = self + .partitions + .iter() + .map(|p| { + let docs = p.docs.clone(); + async move { docs.total_tokens_num().await } + }) + .collect::>(); + let totals: Vec = stream::iter(futures) + .buffer_unordered(io_parallelism) + .try_collect() + .await?; + Ok((totals.into_iter().sum(), num_docs)) } /// Sum the posting-list length for `term` across this index's partitions @@ -671,8 +720,12 @@ impl InvertedIndex { .load_posting_lists(tokens.as_ref(), params.as_ref(), metrics.as_ref()) .await?; if postings.is_empty() { + // No hits in this partition; its DocSet stays + // unloaded, so we never pay the per-doc + // row_id/num_tokens download for it. 
return Result::Ok(PartitionCandidates::empty()); } + let docs_for_wand = part.docs.docs_for_wand(mask.as_ref()).await?; let max_position = postings .iter() .map(|posting| posting.term_index() as usize) @@ -686,8 +739,10 @@ impl InvertedIndex { let params = params.clone(); let mask = mask.clone(); let metrics = metrics.clone(); - spawn_cpu(move || { - let candidates = part.bm25_search( + let part_for_wand = part.clone(); + let mut partition_result = spawn_cpu(move || { + let candidates = part_for_wand.bm25_search( + docs_for_wand.as_ref(), params.as_ref(), operator, mask, @@ -695,12 +750,15 @@ impl InvertedIndex { metrics.as_ref(), shared_threshold, )?; - Ok(PartitionCandidates { + std::result::Result::<_, Error>::Ok(PartitionCandidates { tokens_by_position, candidates, }) }) - .await + .await?; + resolve_deferred_candidates(&part.docs, &mut partition_result.candidates) + .await?; + Result::Ok(partition_result) } }) .collect::>(); @@ -723,11 +781,21 @@ impl InvertedIndex { idf_by_position.push(idf_weight); } for DocCandidate { - row_id, + addr, freqs, doc_length, } in res.candidates { + // resolve_deferred_candidates ran upstream, so every + // candidate carries a real row_id at this point. 
+ let row_id = match addr { + CandidateAddr::RowId(r) => r, + CandidateAddr::Pending(_) => { + return Err(Error::internal( + "bm25_search post-condition: deferred candidate left unresolved", + )); + } + }; let mut score = 0.0; for (term_index, freq) in freqs.into_iter() { debug_assert!((term_index as usize) < idf_by_position.len()); @@ -806,7 +874,7 @@ impl InvertedIndex { store, tokens, inverted_list, - docs, + docs: Arc::new(LazyDocSet::from_loaded(docs)), token_set_format: TokenSetFormat::Arrow, })], deleted_fragments: RoaringBitmap::new(), @@ -967,6 +1035,11 @@ impl InvertedIndex { part.inverted_list .prewarm_posting_lists(with_position) .await?; + // Materialize the deferred DocSet too: prewarm's contract is + // that subsequent queries do no IO, so the per-doc row_ids / + // num_tokens must be resident, not lazily faulted in at query + // time. `ensure_loaded` opens, reads, and drops the reader. + part.docs.ensure_loaded().await?; Result::Ok(()) }); stream::iter(prewarm_futures) @@ -1098,7 +1171,10 @@ pub struct InvertedPartition { store: Arc, pub(crate) tokens: TokenSet, pub(crate) inverted_list: Arc, - pub(crate) docs: DocSet, + /// Per-doc row_id + num_tokens. Wrapped in `LazyDocSet` so partitions + /// that don't contribute hits to a query never pay the full-array + /// download. Scoring paths call `docs_for_wand` before walking wand. + pub(crate) docs: Arc, token_set_format: TokenSetFormat, } @@ -1140,8 +1216,21 @@ impl InvertedPartition { let tokens = TokenSet::load(token_file, token_set_format).await?; let invert_list_file = store.open_index_file(&posting_file_path(id)).await?; let inverted_list = PostingListReader::try_new(invert_list_file, index_cache).await?; - let docs_file = store.open_index_file(&doc_file_path(id)).await?; - let docs = DocSet::load(docs_file, false, frag_reuse_index).await?; + // Defer the per-doc row_id/num_tokens read.
Construction reads only + // the doc count (one footer read) and then drops the reader; the bulk + // load happens on first scoring use, re-opening the docs file on + // demand, and partitions that never score skip it entirely. Storing + // the store + path instead of an open reader keeps a cached partition + // from pinning a docs-file handle for its whole lifetime. + let docs_path = doc_file_path(id); + let num_docs = store.open_index_file(&docs_path).await?.num_rows(); + let docs = Arc::new(LazyDocSet::new( + store.clone(), + docs_path, + num_docs, + false, + frag_reuse_index, + )); Ok(Self { id, @@ -1252,8 +1341,13 @@ impl InvertedPartition { } #[instrument(level = "debug", skip_all)] + // Deferred-DocSet adds the `docs` param (caller materializes it) on top of + // the cross-partition `shared_threshold`, tipping this hot-path search fn + // one over the limit. Bundling args isn't worth the churn here. + #[allow(clippy::too_many_arguments)] pub fn bm25_search( &self, + docs: &DocSet, params: &FtsSearchParams, operator: Operator, mask: Arc, @@ -1265,12 +1359,13 @@ impl InvertedPartition { return Ok(Vec::new()); } - // let local_metrics = LocalMetricsCollector::default(); + // Caller selects the DocSet shape via `LazyDocSet::docs_for_wand` + // and passes it in here; wand uses `docs.has_row_ids()` to + // handle the num_tokens-only case. let scorer = IndexBM25Scorer::new(std::iter::once(self)); - let mut wand = Wand::new(operator, postings.into_iter(), &self.docs, scorer) + let mut wand = Wand::new(operator, postings.into_iter(), docs, scorer) .with_shared_threshold(shared_threshold); let hits = wand.search(params, mask, metrics)?; - // local_metrics.dump_into(metrics); Ok(hits) } @@ -1282,7 +1377,10 @@ impl InvertedPartition { self.inverted_list.posting_tail_codec(), ); builder.tokens = self.tokens.into_mutable(); - builder.docs = self.docs; + // into_builder rewrites every doc, so materialize the full + // DocSet now and clone it out of the Arc. 
+ let docs_arc = self.docs.ensure_loaded().await?; + builder.docs = (*docs_arc).clone(); builder .posting_lists @@ -4242,13 +4340,27 @@ pub struct DocSet { impl DocSet { #[inline] pub fn len(&self) -> usize { - self.row_ids.len() + // Use num_tokens instead of row_ids so the deferred-row_ids + // scoring path (which constructs a DocSet via + // [`Self::from_num_tokens_only`]) still reports the right doc + // count. + self.num_tokens.len() } pub fn is_empty(&self) -> bool { self.len() == 0 } + /// True iff the per-doc `row_id` array is populated. The + /// deferred-row_id scoring path constructs DocSets with the array + /// left empty so wand can skip the load; callers that need to do + /// row_id lookups in the inner loop must check this and fall back + /// to async resolution otherwise. + #[inline] + pub fn has_row_ids(&self) -> bool { + !self.row_ids.is_empty() + } + pub fn iter(&self) -> impl Iterator { self.row_ids.iter().zip(self.num_tokens.iter()) } @@ -4338,7 +4450,35 @@ impl DocSet { let batch = reader.read_range(0..reader.num_rows(), None).await?; let row_id_col = batch[ROW_ID].as_primitive::(); let num_tokens_col = batch[NUM_TOKEN_COL].as_primitive::(); + Self::from_columns(row_id_col, num_tokens_col, is_legacy, frag_reuse_index) + } + /// Build a `DocSet` carrying only the per-doc `num_tokens` array; + /// `row_ids` and `inv` are left empty. Used by the deferred-row_id + /// scoring path: wand checks `has_row_ids()` to skip `row_id` / + /// `num_tokens_by_row_id` calls, and the per-partition caller + /// resolves doc_id → row_id for the surviving top-K post-wand. + pub fn from_num_tokens_only(num_tokens_col: &arrow_array::UInt32Array) -> Self { + let num_tokens = num_tokens_col.values().to_vec(); + let total_tokens = num_tokens.iter().map(|&n| n as u64).sum(); + Self { + row_ids: Vec::new(), + num_tokens, + inv: Vec::new(), + total_tokens, + } + } + + /// Build a `DocSet` from already-loaded `row_id` and `num_tokens` + /// arrow columns. 
Lets callers that have one column already in hand + /// (e.g. `LazyDocSet` after `total_tokens_num` pre-fetched + /// `num_tokens`) skip re-reading that column. + pub fn from_columns( + row_id_col: &UInt64Array, + num_tokens_col: &arrow_array::UInt32Array, + is_legacy: bool, + frag_reuse_index: Option>, + ) -> Result { // for legacy format, the row id is doc id; sorting keeps binary search viable if is_legacy { let (row_ids, num_tokens): (Vec<_>, Vec<_>) = row_id_col @@ -5814,9 +5954,13 @@ mod tests { } } + // Returns the `TempObjDir` guard so callers keep the backing store alive + // for the index's lifetime: the deferred DocSet re-opens the docs file on + // demand (it does not pin an open handle), so the files must still exist + // when the test exercises a scoring path. async fn load_counted_v2_index( num_tokens: usize, - ) -> (Arc, Arc) { + ) -> (Arc, Arc, TempObjDir) { let tmpdir = TempObjDir::default(); let inner_store = Arc::new(LanceIndexStore::new( ObjectStore::local().into(), @@ -5863,7 +6007,7 @@ mod tests { let index = InvertedIndex::load(counting_store, None, &LanceCache::no_cache()) .await .unwrap(); - (index, counter) + (index, counter, tmpdir) } /// IO regression test for the lazy posting-metadata refactor. Builds a @@ -5888,7 +6032,7 @@ mod tests { #[case::tokens_1000(1000)] #[tokio::test] async fn test_bm25_stats_for_terms_is_lazy(#[case] num_tokens: usize) { - let (index, counter) = load_counted_v2_index(num_tokens).await; + let (index, counter, _tmpdir) = load_counted_v2_index(num_tokens).await; assert!( !index.partitions[0].inverted_list.is_legacy_layout(), "this test only proves the lazy path for v2 indexes", @@ -5937,7 +6081,7 @@ mod tests { // total token count. 
let num_tokens = 500; let queried_tokens: [u32; 4] = [0, 1, 2, 3]; - let (index, counter) = load_counted_v2_index(num_tokens).await; + let (index, counter, _tmpdir) = load_counted_v2_index(num_tokens).await; let inverted_list = index.partitions[0].inverted_list.clone(); assert!( !inverted_list.is_legacy_layout(), diff --git a/rust/lance-index/src/scalar/inverted/lazy_docset.rs b/rust/lance-index/src/scalar/inverted/lazy_docset.rs new file mode 100644 index 00000000000..41e1c76473a --- /dev/null +++ b/rust/lance-index/src/scalar/inverted/lazy_docset.rs @@ -0,0 +1,346 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Deferred-load wrapper around [`DocSet`]. +//! +//! The inverted-index `DocSet` holds the per-doc `row_id` and `num_tokens` +//! arrays for a partition. Eager loading on partition open pulls roughly +//! 12 bytes × num_docs per partition; across thousands of partitions on +//! cold object storage that's tens of GiB of IO before a query has even +//! checked whether a partition contains the term it's looking for. +//! +//! [`LazyDocSet`] defers the load. Cheap sync getters (`len`, +//! `total_tokens_cached`) work without IO; async getters fetch on +//! demand and cache. Wand scoring still needs per-doc num_tokens, but +//! only partitions that actually contribute hits pay +//! `ensure_num_tokens_loaded`/`ensure_loaded`. + +use std::sync::Arc; + +use arrow::array::AsArray; +use arrow::datatypes::{UInt32Type, UInt64Type}; +use arrow_array::{UInt32Array, UInt64Array}; +use lance_core::ROW_ID; +use lance_core::Result; +use tokio::sync::OnceCell; + +use crate::frag_reuse::FragReuseIndex; +use crate::scalar::inverted::index::{DocSet, NUM_TOKEN_COL}; +use crate::scalar::{IndexReader, IndexStore}; +use lance_select::mask::RowAddrMask; + +/// Lazy view over an inverted-index partition's `DocSet`. +/// +/// Two variants: +/// - `Loaded`: a pre-materialized DocSet (legacy paths, tests). 
+/// Sync accessors return cached values; async accessors return +/// the same DocSet. +/// - `Deferred`: backed by an [`IndexStore`]; columns are read and +/// cached on first request. +pub enum LazyDocSet { + Loaded(LoadedDocSet), + Deferred(Box), +} + +/// Pre-materialized DocSet view -- no reader, no IO. +pub struct LoadedDocSet { + docs: Arc, + num_rows: usize, + total_tokens: u64, +} + +/// Store-backed DocSet view that loads on demand and caches. +/// +/// Holds the [`IndexStore`] and docs-file path rather than an open +/// [`IndexReader`], so a cached partition does not pin a docs-file +/// handle for its whole lifetime. The reader is re-opened on demand +/// inside each column accessor and dropped when that read completes; +/// because the resulting buffers are cached in the `OnceCell`s below, +/// a contributing partition re-opens only on a cold miss, and a +/// partition that never scores never opens the docs file at all after +/// construction. +pub struct DeferredDocSet { + store: Arc, + docs_path: String, + is_legacy: bool, + frag_reuse_index: Option>, + /// Doc count cached at construction so `len()` stays sync + IO-free. + num_rows: usize, + /// `sum(num_tokens)` cached on first compute. + total_tokens: OnceCell, + /// `NUM_TOKEN_COL` arrow buffer cached on first read. + num_tokens_col: OnceCell>, + /// `ROW_ID` arrow buffer cached on first read. + row_ids_col: OnceCell>, + /// Full DocSet, materialized on first `ensure_loaded`.
+ full: OnceCell>, +} + +impl std::fmt::Debug for LazyDocSet { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Loaded(l) => f + .debug_struct("LazyDocSet::Loaded") + .field("num_rows", &l.num_rows) + .field("total_tokens", &l.total_tokens) + .finish(), + Self::Deferred(d) => f + .debug_struct("LazyDocSet::Deferred") + .field("num_rows", &d.num_rows) + .field("total_tokens_loaded", &d.total_tokens.initialized()) + .field("full_loaded", &d.full.initialized()) + .finish(), + } + } +} + +impl deepsize::DeepSizeOf for LazyDocSet { + fn deep_size_of_children(&self, ctx: &mut deepsize::Context) -> usize { + match self { + Self::Loaded(l) => l.docs.deep_size_of_children(ctx), + Self::Deferred(d) => { + d.full + .get() + .map(|d| d.deep_size_of_children(ctx)) + .unwrap_or(0) + + d.num_tokens_col + .get() + .map(|arr| arr.len() * std::mem::size_of::()) + .unwrap_or(0) + + d.row_ids_col + .get() + .map(|arr| arr.len() * std::mem::size_of::()) + .unwrap_or(0) + } + } + } +} + +impl LazyDocSet { + pub fn new( + store: Arc, + docs_path: String, + num_rows: usize, + is_legacy: bool, + frag_reuse_index: Option>, + ) -> Self { + Self::Deferred(Box::new(DeferredDocSet { + store, + docs_path, + is_legacy, + frag_reuse_index, + num_rows, + total_tokens: OnceCell::new(), + num_tokens_col: OnceCell::new(), + row_ids_col: OnceCell::new(), + full: OnceCell::new(), + })) + } + + /// Wrap an already-materialized [`DocSet`]. Used by legacy paths + /// and tests that need to seed a partition without a reader. + pub fn from_loaded(docs: DocSet) -> Self { + let num_rows = docs.len(); + let total_tokens = docs.total_tokens_num(); + Self::Loaded(LoadedDocSet { + docs: Arc::new(docs), + num_rows, + total_tokens, + }) + } + + pub fn len(&self) -> usize { + match self { + Self::Loaded(l) => l.num_rows, + Self::Deferred(d) => d.num_rows, + } + } + + /// Sync read of cached `total_tokens`. 
Returns `None` for a + /// `Deferred` LazyDocSet that hasn't yet had any of + /// `total_tokens_num` / `ensure_num_tokens_loaded` / `ensure_loaded` + /// run. Used by sync scoring code that has already paid for one + /// of those async calls. + pub fn total_tokens_cached(&self) -> Option { + match self { + Self::Loaded(l) => Some(l.total_tokens), + Self::Deferred(d) => d.total_tokens.get().copied(), + } + } + + /// True if this DocSet carries a FragReuseIndex. Callers MUST + /// avoid the deferred-row_id path when this is set: targeted + /// row_id reads return raw stored ids, bypassing the per-id + /// `remap_row_id` filter that `DocSet::from_columns` applies. + pub fn has_frag_reuse_remap(&self) -> bool { + match self { + Self::Loaded(_) => false, + Self::Deferred(d) => d.frag_reuse_index.is_some(), + } + } + + /// Sum of `num_tokens` across all docs. + pub async fn total_tokens_num(&self) -> Result { + match self { + Self::Loaded(l) => Ok(l.total_tokens), + Self::Deferred(d) => d.total_tokens_num().await, + } + } + + /// Materialize the full DocSet, including row_ids. + pub async fn ensure_loaded(&self) -> Result> { + match self { + Self::Loaded(l) => Ok(l.docs.clone()), + Self::Deferred(d) => d.ensure_loaded().await, + } + } + + /// Materialize a DocSet that carries num_tokens but no row_ids. + /// Used by the deferred-row_id scoring path; the per-partition + /// caller resolves surviving doc_ids -> row_ids post-wand via + /// [`Self::resolve_row_ids`]. The result is NOT cached on the + /// LazyDocSet -- a later `ensure_loaded` must still produce a + /// full DocSet. + pub async fn ensure_num_tokens_loaded(&self) -> Result> { + match self { + Self::Loaded(l) => Ok(l.docs.clone()), + Self::Deferred(d) => d.ensure_num_tokens_loaded().await, + } + } + + /// Pick the right DocSet shape for a wand walk under `mask`: + /// the num_tokens-only deferred form when the mask is trivial + /// AND no FragReuseIndex needs to filter row_ids; otherwise the + /// full DocSet. 
Encapsulates the policy so callers don't have to + /// rederive the conditions for the targeted-read fast path. + pub async fn docs_for_wand(&self, mask: &RowAddrMask) -> Result> { + if mask.is_select_all() && !self.has_frag_reuse_remap() { + self.ensure_num_tokens_loaded().await + } else { + self.ensure_loaded().await + } + } + + /// Resolve a batch of `doc_id`s to their `row_id`s. Used by the + /// deferred-row_id scoring path to map post-wand top-K candidates + /// without going through a full DocSet build. + /// + /// Not safe with a FragReuseIndex (see + /// [`Self::has_frag_reuse_remap`]): the targeted reads return + /// raw stored ids without applying the remap/skip. + pub async fn resolve_row_ids(&self, doc_ids: &[u32]) -> Result> { + match self { + Self::Loaded(l) => Ok(doc_ids.iter().map(|&d| l.docs.row_id(d)).collect()), + Self::Deferred(d) => d.resolve_row_ids(doc_ids).await, + } + } +} + +impl DeferredDocSet { + /// Open a fresh docs-file reader. Dropped by the caller once its read + /// completes, so no handle is pinned across the partition's lifetime. 
+ async fn reader(&self) -> Result> { + self.store.open_index_file(&self.docs_path).await + } + + async fn total_tokens_num(&self) -> Result { + if let Some(v) = self.total_tokens.get() { + return Ok(*v); + } + if let Some(full) = self.full.get() { + let v = full.total_tokens_num(); + let _ = self.total_tokens.set(v); + return Ok(v); + } + let col = self.num_tokens_column().await?; + let sum: u64 = col.values().iter().map(|&n| n as u64).sum(); + let _ = self.total_tokens.set(sum); + Ok(sum) + } + + async fn num_tokens_column(&self) -> Result> { + self.num_tokens_col + .get_or_try_init(|| async { + let reader = self.reader().await?; + let batch = reader + .read_range(0..self.num_rows, Some(&[NUM_TOKEN_COL])) + .await?; + Result::Ok(Arc::new( + batch[NUM_TOKEN_COL].as_primitive::().clone(), + )) + }) + .await + .cloned() + } + + async fn row_ids_column(&self) -> Result> { + self.row_ids_col + .get_or_try_init(|| async { + let reader = self.reader().await?; + let batch = reader.read_range(0..self.num_rows, Some(&[ROW_ID])).await?; + Result::Ok(Arc::new(batch[ROW_ID].as_primitive::().clone())) + }) + .await + .cloned() + } + + async fn ensure_loaded(&self) -> Result> { + let docs = self + .full + .get_or_try_init(|| async { + // If the stats path already pulled NUM_TOKEN_COL, + // read only ROW_ID and rebuild from the two columns. + let docs = if self.num_tokens_col.get().is_some() { + let num_tokens = self.num_tokens_column().await?; + let row_ids = self.row_ids_column().await?; + DocSet::from_columns( + row_ids.as_ref(), + num_tokens.as_ref(), + self.is_legacy, + self.frag_reuse_index.clone(), + )? + } else { + DocSet::load( + self.reader().await?, + self.is_legacy, + self.frag_reuse_index.clone(), + ) + .await? + }; + Result::Ok(Arc::new(docs)) + }) + .await? 
+ .clone(); + let _ = self.total_tokens.set(docs.total_tokens_num()); + Ok(docs) + } + + async fn ensure_num_tokens_loaded(&self) -> Result> { + if let Some(full) = self.full.get() { + return Ok(full.clone()); + } + let num_tokens = self.num_tokens_column().await?; + let docs = Arc::new(DocSet::from_num_tokens_only(num_tokens.as_ref())); + let _ = self.total_tokens.set(docs.total_tokens_num()); + Ok(docs) + } + + async fn resolve_row_ids(&self, doc_ids: &[u32]) -> Result> { + if let Some(full) = self.full.get() + && full.has_row_ids() + { + return Ok(doc_ids.iter().map(|&d| full.row_id(d)).collect()); + } + if let Some(arr) = self.row_ids_col.get() { + return Ok(doc_ids.iter().map(|&d| arr.value(d as usize)).collect()); + } + let ranges: Vec> = doc_ids + .iter() + .map(|&d| d as usize..d as usize + 1) + .collect(); + let reader = self.reader().await?; + let batch = reader.read_ranges(&ranges, Some(&[ROW_ID])).await?; + let arr = batch[ROW_ID].as_primitive::(); + Ok((0..arr.len()).map(|i| arr.value(i)).collect()) + } +} diff --git a/rust/lance-index/src/scalar/inverted/scorer.rs b/rust/lance-index/src/scalar/inverted/scorer.rs index e3fb81871ef..eb7d78ff397 100644 --- a/rust/lance-index/src/scalar/inverted/scorer.rs +++ b/rust/lance-index/src/scalar/inverted/scorer.rs @@ -87,35 +87,36 @@ impl Scorer for MemBM25Scorer { pub struct IndexBM25Scorer<'a> { partitions: Vec<&'a InvertedPartition>, num_docs: usize, - total_tokens: u64, avg_doc_length: f32, } impl<'a> IndexBM25Scorer<'a> { + /// Sync constructor. Reads each partition's cached `total_tokens` via + /// `LazyDocSet::total_tokens_cached()`; callers must have already + /// populated it (via `ensure_loaded`, `ensure_num_tokens_loaded`, or + /// `total_tokens_num`). Panics with a clear message otherwise — this + /// is the wand-scoring path where the contract is statically known. 
pub fn new(partitions: impl Iterator) -> Self { let partitions = partitions.collect::>(); let num_docs = partitions.iter().map(|p| p.docs.len()).sum(); - let total_tokens = partitions + let total_tokens: u64 = partitions .iter() - .map(|part| part.docs.total_tokens_num()) - .sum::(); + .map(|p| { + p.docs.total_tokens_cached().expect( + "IndexBM25Scorer::new requires each partition's total_tokens to be \ + cached; call `ensure_loaded` / `ensure_num_tokens_loaded` / \ + `total_tokens_num` first", + ) + }) + .sum(); let avgdl = total_tokens as f32 / num_docs as f32; Self { partitions, num_docs, - total_tokens, avg_doc_length: avgdl, } } - pub fn num_docs(&self) -> usize { - self.num_docs - } - - pub fn total_tokens(&self) -> u64 { - self.total_tokens - } - pub fn num_docs_containing_token(&self, token: &str) -> usize { self.partitions .iter() diff --git a/rust/lance-index/src/scalar/inverted/wand.rs b/rust/lance-index/src/scalar/inverted/wand.rs index 96cc8146fa6..6d607670aa9 100644 --- a/rust/lance-index/src/scalar/inverted/wand.rs +++ b/rust/lance-index/src/scalar/inverted/wand.rs @@ -435,9 +435,19 @@ impl PostingIterator { } } +/// How wand identified a candidate: either it already had the real +/// row_id (DocSet carried row_ids), or only the partition-local +/// doc_id (deferred-row_id path; the caller must resolve via +/// [`super::lazy_docset::LazyDocSet::resolve_row_ids`]). +#[derive(Debug, Clone, Copy)] +pub enum CandidateAddr { + RowId(u64), + Pending(u32), +} + #[derive(Debug)] pub struct DocCandidate { - pub row_id: u64, + pub addr: CandidateAddr, /// (term_index, freq) pub freqs: Vec<(u32, u32)>, pub doc_length: u32, @@ -698,6 +708,12 @@ impl<'a, S: Scorer> Wand<'a, S> { _ => {} } + // Deferred-row_id path: when the DocSet was built without + // row_ids, wand emits candidates carrying just the + // partition-local doc_id; the outer caller resolves them to + // row_ids post-wand. 
+ let docs_has_row_ids = self.docs.has_row_ids(); + let mut candidates = BinaryHeap::with_capacity(std::cmp::min(limit, BLOCK_SIZE * 10)); let mut num_comparisons = 0; loop { @@ -707,14 +723,20 @@ impl<'a, S: Scorer> Wand<'a, S> { }; num_comparisons += 1; + // Either a real row_id (so we can run the mask check + // inline) or the doc_id widened to u64 (deferred path; + // the outer caller will resolve it post-wand). let row_id = match &doc { DocInfo::Raw(doc) => { - // if the doc is not located, we need to find the row id - self.docs.row_id(doc.doc_id) + if docs_has_row_ids { + self.docs.row_id(doc.doc_id) + } else { + doc.doc_id as u64 + } } DocInfo::Located(doc) => doc.row_id, }; - if !mask.selected(row_id) { + if docs_has_row_ids && !mask.selected(row_id) { if self.operator == Operator::Or { self.push_back_leads(doc.doc_id() + 1); } @@ -764,10 +786,21 @@ impl<'a, S: Scorer> Wand<'a, S> { } metrics.record_comparisons(num_comparisons); + // The heap entry's `row_id` slot is either a real row_id + // (DocSet had row_ids) or the doc_id widened to u64 + // (deferred). Tag it accordingly so the caller can match + // rather than guess. + let to_addr = |row_id_slot: u64| { + if docs_has_row_ids { + CandidateAddr::RowId(row_id_slot) + } else { + CandidateAddr::Pending(row_id_slot as u32) + } + }; Ok(candidates .into_iter() .map(|Reverse((doc, freqs, doc_length))| DocCandidate { - row_id: doc.row_id, + addr: to_addr(doc.row_id), freqs, doc_length, }) @@ -871,10 +904,12 @@ impl<'a, S: Scorer> Wand<'a, S> { } metrics.record_comparisons(num_comparisons); + // flat_search is driven by an explicit row_ids iterator, so + // every candidate already has a real row_id. 
Ok(candidates .into_iter() .map(|Reverse((doc, freqs, doc_length))| DocCandidate { - row_id: doc.row_id, + addr: CandidateAddr::RowId(doc.row_id), freqs, doc_length, }) @@ -1942,7 +1977,13 @@ mod tests { &NoOpMetricsCollector, ) .unwrap(); - let mut row_ids = hits.iter().map(|hit| hit.row_id).collect::>(); + let mut row_ids = hits + .iter() + .map(|hit| match hit.addr { + CandidateAddr::RowId(r) => r, + CandidateAddr::Pending(_) => panic!("row_id should be set in this path"), + }) + .collect::>(); row_ids.sort_unstable(); (row_ids, scored.load(Ordering::Relaxed)) }; @@ -2160,7 +2201,13 @@ mod tests { ) .unwrap(); - let matched = result.into_iter().map(|doc| doc.row_id).collect::>(); + let matched = result + .into_iter() + .map(|doc| match doc.addr { + CandidateAddr::RowId(r) => r, + CandidateAddr::Pending(_) => panic!("row_id should be set in this path"), + }) + .collect::>(); assert_eq!(matched, vec![2]); } diff --git a/rust/lance-index/src/scalar/lance_format.rs b/rust/lance-index/src/scalar/lance_format.rs index 3be82def542..7fd9d12547d 100644 --- a/rust/lance-index/src/scalar/lance_format.rs +++ b/rust/lance-index/src/scalar/lance_format.rs @@ -239,6 +239,85 @@ impl IndexReader for current_reader::FileReader { Ok(batches[0].clone()) } + async fn read_ranges( + &self, + ranges: &[std::ops::Range], + projection: Option<&[&str]>, + ) -> Result { + let empty_batch = || { + Ok(RecordBatch::new_empty(Arc::new( + self.schema().as_ref().into(), + ))) + }; + if ranges.is_empty() { + return empty_batch(); + } + let projection = if let Some(projection) = projection { + ReaderProjection::from_column_names( + self.metadata().version(), + self.schema(), + projection, + )? + } else { + ReaderProjection::from_whole_schema(self.schema(), self.metadata().version()) + }; + // `DecodeBatchScheduler::schedule_ranges` requires sorted, + // non-overlapping ranges; sort internally and permute the + // result back to caller order so callers don't have to know. 
+ let mut order: Vec = (0..ranges.len()).collect(); + order.sort_by_key(|&i| ranges[i].start); + let already_sorted = order.iter().enumerate().all(|(i, &j)| i == j); + let sorted_ranges: Arc<[std::ops::Range]> = order + .iter() + .map(|&i| ranges[i].start as u64..ranges[i].end as u64) + .collect(); + let total_rows: u64 = sorted_ranges.iter().map(|r| r.end - r.start).sum(); + let batches = self + .read_stream_projected( + ReadBatchParams::Ranges(sorted_ranges), + (total_rows as u32).max(1), + 16, + projection, + FilterExpression::no_filter(), + ) + .await? + .try_collect::>() + .await?; + let merged = match batches.len() { + 0 => return empty_batch(), + 1 => batches.into_iter().next().unwrap(), + _ => { + let schema = batches[0].schema(); + arrow_select::concat::concat_batches(&schema, &batches)? + } + }; + if already_sorted { + return Ok(merged); + } + let sorted_sizes: Vec = order + .iter() + .map(|&i| (ranges[i].end - ranges[i].start) as u32) + .collect(); + let mut sorted_offsets = Vec::with_capacity(sorted_sizes.len()); + let mut acc = 0u32; + for &s in &sorted_sizes { + sorted_offsets.push(acc); + acc += s; + } + let mut sorted_pos = vec![0usize; ranges.len()]; + for (sp, &oi) in order.iter().enumerate() { + sorted_pos[oi] = sp; + } + let mut take_indices = Vec::with_capacity(total_rows as usize); + for &sp in &sorted_pos { + for k in 0..sorted_sizes[sp] { + take_indices.push(sorted_offsets[sp] + k); + } + } + let take_arr = arrow_array::UInt32Array::from(take_indices); + Ok(arrow_select::take::take_record_batch(&merged, &take_arr)?) + } + async fn read_range_stream( &self, range: std::ops::Range, diff --git a/rust/lance-select/src/mask.rs b/rust/lance-select/src/mask.rs index 86a475866c9..a10ad9a6f50 100644 --- a/rust/lance-select/src/mask.rs +++ b/rust/lance-select/src/mask.rs @@ -78,6 +78,13 @@ impl RowAddrMask { } } + /// True if every row_id is selected. Lets callers (e.g. 
the FTS wand + /// loop) skip per-row mask checks entirely, which in turn lets the + /// deferred-row_id scoring path skip loading the row_id column. + pub fn is_select_all(&self) -> bool { + matches!(self, Self::BlockList(b) if b.is_empty()) + } + /// Return the indices of the input row ids that were valid pub fn selected_indices<'a>(&self, row_ids: impl Iterator + 'a) -> Vec { row_ids diff --git a/rust/lance/src/dataset/tests/dataset_index.rs b/rust/lance/src/dataset/tests/dataset_index.rs index e785de7bee4..beb6e2b99fd 100644 --- a/rust/lance/src/dataset/tests/dataset_index.rs +++ b/rust/lance/src/dataset/tests/dataset_index.rs @@ -1204,6 +1204,83 @@ async fn test_fts_rank() { assert_eq!(row_ids, &[0]); } +#[tokio::test] +async fn test_fts_unfiltered_after_filtered_returns_real_row_ids() { + // After a filtered FTS scan populates the per-partition cache, + // the next unfiltered scan must still return real row_ids, not + // partition-local doc_ids. Needs >1 fragment so the two differ + // (fragment N's row_ids start at N << 32). 
+ let text_col = GenericStringArray::::from(vec![ + "alpha first", + "alpha second", + "alpha third", + "alpha fourth", + ]); + let batch = RecordBatch::try_new( + arrow_schema::Schema::new(vec![arrow_schema::Field::new( + "text", + text_col.data_type().to_owned(), + false, + )]) + .into(), + vec![Arc::new(text_col) as ArrayRef], + ) + .unwrap(); + let schema = batch.schema(); + let test_uri = TempStrDir::default(); + let mut dataset = Dataset::write( + RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema), + &test_uri, + Some(WriteParams { + max_rows_per_file: 1, + ..Default::default() + }), + ) + .await + .unwrap(); + dataset + .create_index( + &["text"], + IndexType::Inverted, + None, + &InvertedIndexParams::default(), + true, + ) + .await + .unwrap(); + + let fts = |ds: &Dataset, filter: Option<&str>| { + let mut s = ds.scan(); + s.with_row_id() + .full_text_search(FullTextSearchQuery::new("alpha".to_owned())) + .unwrap(); + if let Some(f) = filter { + s.prefilter(true).filter(f).unwrap(); + } + s + }; + let sorted_row_ids = |b: &RecordBatch| { + let mut v: Vec = b[ROW_ID].as_primitive::().values().to_vec(); + v.sort(); + v + }; + + let fresh = sorted_row_ids(&fts(&dataset, None).try_into_batch().await.unwrap()); + assert_eq!(fresh.len(), 4); + + // Reopen so the baseline scan's cached LazyDocSet doesn't mask + // the regression -- the filtered scan needs to be the first + // thing that touches the DocSet. 
+ let dataset = Dataset::open(test_uri.as_str()).await.unwrap(); + fts(&dataset, Some("text LIKE 'alpha first%'")) + .try_into_batch() + .await + .unwrap(); + + let after = sorted_row_ids(&fts(&dataset, None).try_into_batch().await.unwrap()); + assert_eq!(after, fresh); +} + async fn create_fts_dataset< Offset: arrow::array::OffsetSizeTrait, ListOffset: arrow::array::OffsetSizeTrait, diff --git a/rust/lance/src/dataset/tests/dataset_merge_update.rs b/rust/lance/src/dataset/tests/dataset_merge_update.rs index 6374a0d2867..7fa03d6e78d 100644 --- a/rust/lance/src/dataset/tests/dataset_merge_update.rs +++ b/rust/lance/src/dataset/tests/dataset_merge_update.rs @@ -14,7 +14,9 @@ use crate::{Dataset, Error}; use lance_core::ROW_ADDR; use lance_index::IndexType; use lance_index::optimize::OptimizeOptions; +use lance_index::scalar::FullTextSearchQuery; use lance_index::scalar::ScalarIndexParams; +use lance_index::scalar::inverted::tokenizer::InvertedIndexParams; use mock_instant::thread_local::MockClock; use crate::dataset::write::{InsertBuilder, WriteMode, WriteParams}; @@ -2631,3 +2633,82 @@ async fn test_sub_schema_merge_insert_binary_v2_2() { assert_eq!(binary_arr.value(0), data_a.as_slice()); assert_eq!(binary_arr.value(1), data_b.as_slice()); } + +#[tokio::test] +async fn test_fts_unfiltered_after_compaction_returns_remapped_row_ids() { + // After `compact_files` with `defer_index_remap = true`, queries + // read the old FTS index but must apply the dataset's + // FragReuseIndex remap. Otherwise the deferred-row_id path + // returns pre-compaction row_ids that no longer exist. 
+ use arrow::datatypes::UInt64Type; + + let schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("id", DataType::Int32, false), + ArrowField::new("text", DataType::Utf8, false), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![0, 1, 2, 3])), + Arc::new(StringArray::from(vec![ + "alpha first", + "alpha second", + "alpha third", + "alpha fourth", + ])), + ], + ) + .unwrap(); + let mut dataset = Dataset::write( + RecordBatchIterator::new(vec![Ok(batch)], schema), + "memory://test_fts_frag_reuse", + Some(WriteParams { + max_rows_per_file: 1, // 4 fragments -> 4 partitions + ..Default::default() + }), + ) + .await + .unwrap(); + dataset + .create_index( + &["text"], + IndexType::Inverted, + None, + &InvertedIndexParams::default(), + true, + ) + .await + .unwrap(); + compact_files( + &mut dataset, + CompactionOptions { + target_rows_per_fragment: 1000, + defer_index_remap: true, + ..Default::default() + }, + None, + ) + .await + .unwrap(); + + let after = dataset + .scan() + .with_row_id() + .full_text_search(FullTextSearchQuery::new("alpha".to_owned())) + .unwrap() + .try_into_batch() + .await + .unwrap(); + assert_eq!(after.num_rows(), 4); + let returned: Vec = after[ROW_ID].as_primitive::().values().to_vec(); + let live: std::collections::HashSet = + dataset.scan().with_row_id().try_into_batch().await.unwrap()[ROW_ID] + .as_primitive::() + .values() + .iter() + .copied() + .collect(); + for id in &returned { + assert!(live.contains(id), "stale row_id {id}"); + } +} diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 07033a74616..81b0ddee407 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -7513,12 +7513,16 @@ mod tests { .unwrap(); assert!(results.num_rows() > 0); - // Verify IOPs + // Verify IOPs. 
The deferred DocSet loads per-doc num_tokens/row_ids on + // first use rather than eagerly at index open, so a cold (un-prewarmed) + // query opens the docs file on demand — a couple more IOPs than the + // eager path, but constant and only on the first query (prewarm or a + // warm cache serve it with zero IO). let stats = dataset.object_store.as_ref().io_stats_incremental(); assert_io_lt!( stats, read_iops, - 15, + 18, "Inverted index query should use minimal IOPs" ); } From 12f529f90f1d839dc9f205db38bb5cba49e20be5 Mon Sep 17 00:00:00 2001 From: Vivek Date: Thu, 4 Jun 2026 21:58:22 -0700 Subject: [PATCH 029/177] fix(fts): reset TokenSet next_id and total_length after remap (#7115) During an FTS index build, merge_existing_segments() can panic with something like "index out of bounds: the len is 936 but the index is 1077". This happens when there has been an update/delete on the table that causes some token to be evicted from the posting lists. Fixed by updating the associated bookkeeping --- .../src/scalar/inverted/builder.rs | 38 +++++++++++++++++++ rust/lance-index/src/scalar/inverted/index.rs | 11 +++++- 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/rust/lance-index/src/scalar/inverted/builder.rs b/rust/lance-index/src/scalar/inverted/builder.rs index 7c347b50af3..437ea30a730 100644 --- a/rust/lance-index/src/scalar/inverted/builder.rs +++ b/rust/lance-index/src/scalar/inverted/builder.rs @@ -3091,6 +3091,44 @@ mod tests { Ok(()) } + #[test] + fn test_merge_from_after_remap_does_not_panic() { + // `first` is the merge accumulator. Give it three tokens, then remap away the + // middle one, mirroring filter_old_data dropping a token whose postings emptied.
+ let mut first = InnerBuilder::new(0, false, TokenSetFormat::default()); + for token in ["a", "b", "c"] { + first.tokens.add(token.to_owned()); + } + first + .posting_lists + .resize_with(first.tokens.len(), || PostingListBuilder::new(false)); + let first_doc = first.docs.append(10, 1); + first.posting_lists[0].add(first_doc, PositionRecorder::Count(1)); // "a" + first.posting_lists[2].add(first_doc, PositionRecorder::Count(1)); // "c" + + // Remove token "b" (id 1) and compact its (empty) posting list to match. + first.tokens.remap(&[1]); + first.posting_lists.remove(1); + assert_eq!(first.tokens.len(), first.posting_lists.len()); + + // `second` contributes a brand-new token absent from `first`. Before the fix, + // get_or_add returned the stale next_id, indexing past posting_lists. + let mut second = InnerBuilder::new(1, false, TokenSetFormat::default()); + let zeta = second.tokens.add("zeta".to_owned()); + second + .posting_lists + .resize_with(second.tokens.len(), || PostingListBuilder::new(false)); + let second_doc = second.docs.append(20, 1); + second.posting_lists[zeta as usize].add(second_doc, PositionRecorder::Count(1)); + + first.merge_from(second).unwrap(); + + assert_eq!(first.tokens.len(), 3); + assert_eq!(first.posting_lists.len(), 3); + let zeta_id = first.tokens.get("zeta").expect("zeta should be merged in"); + assert!((zeta_id as usize) < first.posting_lists.len()); + } + #[tokio::test] async fn test_update_index_returns_worker_error_when_workers_exit_during_dispatch() { let num_batches = (*LANCE_FTS_NUM_SHARDS * 2 + 1) as u64; diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index 4304223592c..a3e91f0f4c7 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -1736,17 +1736,26 @@ impl TokenSet { } }; + let mut retained_length = 0; map.retain( - |_, token_id| match removed_token_ids.binary_search(token_id) { + |token, token_id| 
match removed_token_ids.binary_search(token_id) { Ok(_) => false, Err(index) => { *token_id -= index as u32; + retained_length += token.len(); true } }, ); self.tokens = TokenMap::HashMap(map); + + // The retain above compacts the surviving token ids into a dense `[0, len)` + // range, so `next_id` (handed to the next new token) must follow them down. + // `total_length` likewise must drop the removed tokens' bytes; it is persisted + // and feeds memory accounting, so a stale value drifts across remap/merge cycles. + self.next_id = self.tokens.len() as u32; + self.total_length = retained_length; } pub fn next_id(&self) -> u32 { From 1f7e8f77908bde6e3d94c2ac1cf1175d7821e3ba Mon Sep 17 00:00:00 2001 From: Yang Cen Date: Fri, 5 Jun 2026 14:09:49 +0800 Subject: [PATCH 030/177] feat(index): support multi-bit IVF_RQ storage (#7038) ## Feature This PR adds lower-level IVF_RQ multi-bit quantization/storage preparation for `num_bits=2..9` while preserving the existing `num_bits=1` behavior. It implements RabitQ split-code generation using the existing 1-bit sign code plus persisted ex-code storage, wires the PR1 split-code metadata/storage scaffolding through the index build path, and adds factor calculation for Lance's IVF residual semantics. Public index creation and search for `num_bits>1` remain explicitly gated until the split-code query/FastScan path lands in a follow-up PR. Old IVF_RQ indexes and `num_bits=1` behavior remain compatible. `num_bits=9` uses the split layout as `1 sign bit + 8 ex-code bits`; the ex-code remains stored in `__ex_codes` as `uint8` bytes, matching RaBitQ-Library's `ex_bits=8` compact-code path. ## Why This is PR2 in the IVF_RQ split-code storage preparation. It keeps the lower-level multi-bit quantizer/storage pieces in place and validates schema round-trip behavior so PR3 can enable the public creation and query-side split-code scan path together. 
## Validation - `cargo fmt --all` - `cargo clippy --all --tests --benches -- -D warnings` - `cargo test -p lance-index vector::bq::builder` - `cargo test -p lance-index bq::transform::tests::test_rq_transformer_writes_multi_bit_ex_scale_factors` - `cargo test -p lance-index bq::storage` - `cargo test -p lance-index test_merge_ivf_rq_multi_bit_preserves_split_columns` - `cargo test -p lance test_build_ivf_rq_multi_bit_persists_split_codes_and_gates_search` (compiles and reports the test as ignored because public `num_bits>1` creation is gated until query support lands) - `make install` from `python/` - `uv run make lint` from `python/` was attempted locally; ruff passed, but pyright was blocked by the local macOS Python environment selecting Python 3.14 while the project pyright config targets Python 3.13 and optional `tensorflow`/`torch` imports are not available in that local environment. Per request, this PR relies on GitHub Actions for the full Python validation. ## Benchmark Ran on GCP VM `yang-agent-ivfrq-nbits-20260601` (`c4-standard-16`, 512GB disk), using `search-benchmark` at `61ef8f7b97589032a83eeae1e52664be9f035551`. Branch commit at measurement time: `38239151aa148566bf80e158f358814815e64a62` Base main commit: `d8415d3ab15fb17a2315d7c312d9bee52106b810` Query benchmark for `num_bits=1` stayed essentially flat against main across `sift1m`, `gist`, and `dbpedia`. Average QPS over the query configs was: | dataset | main QPS | branch QPS | |---|---:|---:| | sift1m | 2079.6 | 2068.8 | | gist | 1842.2 | 1799.1 | | dbpedia | 1709.7 | 1709.2 | `num_bits>1` query is intentionally unsupported in this PR and fails with the explicit gate. The lower-level build-only measurements below were taken before restoring the public creation gate; they are included only as storage-prep signal, not as a supported public creation path. 
| dataset | bits | build s | index overhead MiB | |---|---:|---:|---:| | sift1m | 1 | 1.950 | 30.7 | | sift1m | 4 | 3.430 | 84.1 | | sift1m | 5 | 4.066 | 99.3 | | sift1m | 7 | 5.602 | 129.8 | | gist | 1 | 10.097 | 130.6 | | gist | 4 | 18.293 | 481.6 | | gist | 5 | 21.619 | 596.0 | | gist | 7 | 32.525 | 824.9 | | dbpedia | 1 | 14.959 | 194.8 | | dbpedia | 4 | 28.865 | 737.9 | | dbpedia | 5 | 32.807 | 916.4 | | dbpedia | 7 | 49.557 | 1273.4 | --- docs/src/format/index/vector/index.md | 27 +- .../tests/compat/test_vector_indices.py | 12 + python/python/tests/test_vector_index.py | 30 +- rust/lance-index/src/vector/bq.rs | 17 +- rust/lance-index/src/vector/bq/builder.rs | 372 +++++++++++- rust/lance-index/src/vector/bq/storage.rs | 217 ++++++- rust/lance-index/src/vector/bq/transform.rs | 566 +++++++++++++++--- .../src/vector/distributed/index_merger.rs | 192 +++++- rust/lance-index/src/vector/ivf.rs | 16 +- rust/lance/src/index/vector.rs | 17 +- rust/lance/src/index/vector/ivf/v2.rs | 90 ++- 11 files changed, 1384 insertions(+), 172 deletions(-) diff --git a/docs/src/format/index/vector/index.md b/docs/src/format/index/vector/index.md index e565bd737f0..3b209934f64 100644 --- a/docs/src/format/index/vector/index.md +++ b/docs/src/format/index/vector/index.md @@ -192,12 +192,15 @@ Compresses vectors using scalar quantization for moderate memory savings: Compresses vectors using RabitQ with random rotation and binary quantization for extreme compression: -| Column | Type | Nullable | Description | -| ----------------- | -------------------------- | -------- | --------------------------------------------------------------- | -| `_rowid` | uint64 | false | Row identifier | -| `_rabit_codes` | list[dimension / 8] | false | Binary quantized codes (1 bit per dimension, packed into bytes) | -| `__add_factors` | float32 | false | Additive correction factors for distance computation | -| `__scale_factors` | float32 | false | Scale correction factors for distance computation | 
+| Column | Type | Nullable | Description | +| -------------------- | ------------------------------------------------ | ------------------------ | --------------------------------------------------------------- | +| `_rowid` | uint64 | false | Row identifier | +| `_rabit_codes` | list[dimension / 8] | false | Binary quantized codes (1 bit per dimension, packed into bytes) | +| `__add_factors` | float32 | false | Additive correction factors for distance computation | +| `__scale_factors` | float32 | false | Scale correction factors for distance computation | +| `__ex_codes` | list[ceil(dimension * (num_bits - 1) / 8)] | false for `num_bits > 1` | Extra RabitQ code bits for multi-bit RQ | +| `__add_factors_ex` | float32 | false for `num_bits > 1` | Additive correction factors for ex-code distance computation | +| `__scale_factors_ex` | float32 | false for `num_bits > 1` | Scale correction factors for ex-code distance computation | #### Arrow Schema Metadata @@ -248,7 +251,7 @@ For **RabitQ (RQ)**: | JSON Key | Type | Description | | --------------------- | ---- | ---------------------------------------------------- | | `rotate_mat_position` | u32 | Position of the rotation matrix in the global buffer | -| `num_bits` | u8 | Number of bits per dimension (currently always 1) | +| `num_bits` | u8 | Number of bits per dimension, in the range 1..=9 | | `code_dim` | u32 | Rotated vector dimension for the 1-bit binary code | | `packed` | bool | Whether codes are packed for optimized computation | @@ -274,9 +277,10 @@ to rotate vectors before binary quantization: ``` The rotation matrix has shape `[code_dim, code_dim]` where `code_dim` is the rotated vector -dimension. Current IVF_RQ stores the 1-bit binary code in `_rabit_codes`; future multi-bit support -will store the remaining `num_bits - 1` ex-code bits separately instead of widening this binary -code path. +dimension. 
IVF_RQ always stores the 1-bit binary sign code in `_rabit_codes`; for `num_bits > 1`, +the remaining `num_bits - 1` ex-code bits are stored in `__ex_codes` instead of widening the +binary code path. `num_bits=1` indexes only store the binary-code factor columns; multi-bit indexes +also store separate ex-code additive and scale factors. ## Appendices @@ -319,7 +323,8 @@ pa.schema([ ### Appendix 2: Example IVF_RQ Format This example shows how an `IVF_RQ` index is physically laid out. Assume vectors have dimension 128, -RQ uses 1 bit per dimension (num_bits=1), and distance type is "l2". +RQ uses 1 bit per dimension (`num_bits=1`), and distance type is "l2". For `num_bits > 1`, the +auxiliary schema also includes `__ex_codes`, `__add_factors_ex`, and `__scale_factors_ex`. #### Index File diff --git a/python/python/tests/compat/test_vector_indices.py b/python/python/tests/compat/test_vector_indices.py index b98ffdf63e3..e381a3ce554 100644 --- a/python/python/tests/compat/test_vector_indices.py +++ b/python/python/tests/compat/test_vector_indices.py @@ -8,6 +8,7 @@ can be read and written by other versions. """ +import os import shutil from pathlib import Path @@ -268,6 +269,11 @@ class IvfRqVectorIndex(UpgradeDowngradeTest): def __init__(self, path: Path): self.path = path + def current_env(self, method_name: str): + if method_name == "check_read": + return {"LANCE_COMPAT_CURRENT_RUNTIME": "1"} + return {} + def create(self): """Create dataset with IVF_RQ vector index.""" shutil.rmtree(self.path, ignore_errors=True) @@ -319,6 +325,12 @@ def check_read(self): stats = ds.stats.index_stats(name) assert stats["num_indexed_rows"] > 0 + if os.environ.get("LANCE_COMPAT_CURRENT_RUNTIME") == "1": + # Old 1-bit IVF_RQ indexes do not have split ex-code columns. + # The successful query above verifies the current reader does not + # require them. 
+ sub_index = stats["indices"][0]["sub_index"] + assert sub_index["num_bits"] == 1 def check_write(self): """Verify can insert vectors and run optimize workflows.""" diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 047b94bf8ac..8a3a85bf6e7 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -1067,15 +1067,31 @@ def test_create_ivf_rq_skip_transpose(): assert stats["indices"][0]["sub_index"]["packed"] is False -def test_create_ivf_rq_rejects_unsupported_num_bits(): +@pytest.mark.skip( + reason=( + "IVF_RQ num_bits>1 creation is gated until split-code search support " + "is implemented" + ) +) +def test_create_ivf_rq_multi_bit_gates_search(): ds = lance.write_dataset(create_table(), "memory://") - with pytest.raises(NotImplementedError, match="only num_bits=1 is supported"): - ds.create_index( - "vector", - index_type="IVF_RQ", - num_partitions=4, - num_bits=2, + ds = ds.create_index( + "vector", + index_type="IVF_RQ", + num_partitions=4, + num_bits=9, + ) + stats = ds.stats.index_stats("vector_idx") + assert stats["indices"][0]["sub_index"]["num_bits"] == 9 + + with pytest.raises(pa.ArrowInvalid, match="num_bits>1 search is not supported"): + ds.to_table( + nearest={ + "column": "vector", + "q": np.random.randn(128).astype(np.float32), + "k": 10, + } ) diff --git a/rust/lance-index/src/vector/bq.rs b/rust/lance-index/src/vector/bq.rs index 51439e2c905..8a347f48817 100644 --- a/rust/lance-index/src/vector/bq.rs +++ b/rust/lance-index/src/vector/bq.rs @@ -23,7 +23,7 @@ pub mod storage; pub mod transform; pub const RABIT_MIN_NUM_BITS: u8 = 1; -pub const RABIT_MAX_NUM_BITS: u8 = 8; +pub const RABIT_MAX_NUM_BITS: u8 = 9; pub const RABIT_BINARY_NUM_BITS: u8 = 1; #[derive(Clone, Default)] @@ -131,7 +131,7 @@ pub fn validate_supported_rq_num_bits(num_bits: u8) -> Result<()> { validate_rq_num_bits(num_bits)?; if num_bits != RABIT_BINARY_NUM_BITS { return 
Err(Error::not_supported(format!( - "IVF_RQ num_bits={} is not supported yet; only num_bits=1 is supported", + "IVF_RQ num_bits={} index creation is not supported until split-code search support is implemented", num_bits ))); } @@ -244,7 +244,7 @@ mod tests { #[test] fn test_rabit_num_bits_validation() { validate_rq_num_bits(1).unwrap(); - validate_rq_num_bits(8).unwrap(); + validate_rq_num_bits(9).unwrap(); let err = validate_rq_num_bits(0).unwrap_err(); assert!( @@ -253,16 +253,18 @@ mod tests { err ); - let err = validate_rq_num_bits(9).unwrap_err(); + let err = validate_rq_num_bits(10).unwrap_err(); assert!( err.to_string().contains("IVF_RQ num_bits must be in"), "{}", err ); - let err = validate_supported_rq_num_bits(2).unwrap_err(); + validate_supported_rq_num_bits(1).unwrap(); + let err = validate_supported_rq_num_bits(9).unwrap_err(); assert!( - err.to_string().contains("only num_bits=1 is supported"), + err.to_string() + .contains("num_bits=9 index creation is not supported"), "{}", err ); @@ -271,13 +273,14 @@ mod tests { #[test] fn test_rabit_split_code_byte_sizing() { assert_eq!(rabit_ex_bits(1).unwrap(), 0); - assert_eq!(rabit_ex_bits(8).unwrap(), 7); + assert_eq!(rabit_ex_bits(9).unwrap(), 8); assert_eq!(rabit_binary_code_bytes(128), 16); assert_eq!(rabit_binary_code_bytes(129), 17); assert_eq!(rabit_ex_code_bytes(128, 0).unwrap(), 0); assert_eq!(rabit_ex_code_bytes(128, 3).unwrap(), 48); + assert_eq!(rabit_ex_code_bytes(128, 8).unwrap(), 128); assert_eq!(rabit_ex_code_bytes(129, 3).unwrap(), 49); } } diff --git a/rust/lance-index/src/vector/bq/builder.rs b/rust/lance-index/src/vector/bq/builder.rs index 70e084472d7..df6e6591299 100644 --- a/rust/lance-index/src/vector/bq/builder.rs +++ b/rust/lance-index/src/vector/bq/builder.rs @@ -18,13 +18,15 @@ use rayon::prelude::*; use crate::vector::bq::storage::{ RABIT_CODE_COLUMN, RABIT_METADATA_KEY, RabitQuantizationMetadata, RabitQuantizationStorage, - rabit_binary_code_field, + rabit_binary_code_field, 
rabit_ex_code_field, +}; +use crate::vector::bq::transform::{ + ADD_FACTORS_FIELD, EX_ADD_FACTORS_FIELD, EX_SCALE_FACTORS_FIELD, SCALE_FACTORS_FIELD, }; -use crate::vector::bq::transform::{ADD_FACTORS_FIELD, SCALE_FACTORS_FIELD}; use crate::vector::bq::{ - RQBuildParams, RQRotationType, rabit_binary_code_bytes, + RQBuildParams, RQRotationType, rabit_binary_code_bytes, rabit_ex_bits, rabit_ex_code_bytes, rotation::{apply_fast_rotation, fast_rotation_signs_len, random_fast_rotation_signs}, - validate_supported_rq_num_bits, + validate_rq_num_bits, }; use crate::vector::quantizer::{Quantization, Quantizer, QuantizerBuildParams}; @@ -57,6 +59,14 @@ pub struct RabitQuantizer { metadata: RabitQuantizationMetadata, } +pub(crate) struct RabitQuantizedBatch { + pub binary_codes: ArrayRef, + pub ex_codes: Option, + pub ex_res_dot_dists: Option>, + pub rotated_residuals: Option>, + pub ex_code_values: Option>, +} + #[inline] fn pack_sign_bits(codes: &mut [u8], rotated: &[f32]) { codes.fill(0); @@ -67,6 +77,132 @@ fn pack_sign_bits(codes: &mut [u8], rotated: &[f32]) { } } +#[inline] +fn pack_ex_code_bits(codes: &mut [u8], ex_values: &[u8], ex_bits: u8) { + codes.fill(0); + let ex_bits = ex_bits as usize; + for (dim_idx, &value) in ex_values.iter().enumerate() { + let bit_offset = dim_idx * ex_bits; + for bit_idx in 0..ex_bits { + if (value >> bit_idx) & 1 != 0 { + let dst_bit = bit_offset + bit_idx; + codes[dst_bit / u8::BITS as usize] |= 1u8 << (dst_bit % u8::BITS as usize); + } + } + } +} + +const EX_QUANTIZATION_EPSILON: f32 = 1.0e-5; +const EX_TIGHT_START: [f32; 9] = [0.0, 0.15, 0.20, 0.52, 0.59, 0.71, 0.75, 0.77, 0.81]; + +fn best_ex_rescale_factor(abs_normalized: &[f32], ex_bits: u8) -> f32 { + let max_value = abs_normalized + .iter() + .copied() + .filter(|value| value.is_finite()) + .fold(0.0f32, f32::max); + if max_value <= 0.0 { + return 0.0; + } + + let max_code = (1usize << ex_bits) - 1; + let t_end = ((max_code + 10) as f32) / max_value; + let t_start = t_end * 
EX_TIGHT_START[ex_bits as usize]; + + let mut current_codes = Vec::with_capacity(abs_normalized.len()); + let mut squared_denominator = abs_normalized.len() as f32 * 0.25; + let mut numerator = 0.0f32; + let mut thresholds = Vec::with_capacity(abs_normalized.len() * max_code); + + for (idx, &value) in abs_normalized.iter().enumerate() { + if value <= 0.0 || !value.is_finite() { + current_codes.push(0usize); + continue; + } + + let current = ((t_start * value) + EX_QUANTIZATION_EPSILON) + .floor() + .clamp(0.0, max_code as f32) as usize; + current_codes.push(current); + squared_denominator += (current * current + current) as f32; + numerator += (current as f32 + 0.5) * value; + + let mut next = current + 1; + while next <= max_code { + let threshold = next as f32 / value; + if threshold < t_end { + thresholds.push((threshold, idx)); + } + next += 1; + } + } + + thresholds.sort_unstable_by(|(left, _), (right, _)| left.total_cmp(right)); + + let mut best_inner_product = numerator / squared_denominator.sqrt(); + let mut best_t = t_start; + for (threshold, idx) in thresholds { + current_codes[idx] += 1; + let updated = current_codes[idx]; + squared_denominator += (2 * updated) as f32; + numerator += abs_normalized[idx]; + + let current_inner_product = numerator / squared_denominator.sqrt(); + if current_inner_product > best_inner_product { + best_inner_product = current_inner_product; + best_t = threshold; + } + } + + best_t +} + +fn quantize_ex_code( + rotated: &[f32], + ex_bits: u8, + ex_code_dst: &mut [u8], + ex_code_values_dst: &mut [u8], +) -> f32 { + debug_assert_eq!(rotated.len(), ex_code_values_dst.len()); + let norm_squared = rotated.iter().map(|value| value * value).sum::(); + if norm_squared <= f32::EPSILON || !norm_squared.is_finite() { + ex_code_dst.fill(0); + ex_code_values_dst.fill(0); + return 0.0; + } + + let norm = norm_squared.sqrt(); + let abs_normalized = rotated + .iter() + .map(|value| value.abs() / norm) + .collect::>(); + let t = 
best_ex_rescale_factor(&abs_normalized, ex_bits); + let max_code = ((1u16 << ex_bits) - 1) as u8; + let mask = max_code; + let code_bias = -((1u32 << ex_bits) as f32 - 0.5); + let mut residual_dot_code = 0.0f32; + + for ((&value, &abs_value), ex_code_value) in rotated + .iter() + .zip(abs_normalized.iter()) + .zip(ex_code_values_dst.iter_mut()) + { + let mut ex_code = ((t * abs_value) + EX_QUANTIZATION_EPSILON) + .floor() + .clamp(0.0, max_code as f32) as u8; + if value.is_sign_negative() { + ex_code = (!ex_code) & mask; + } + let sign_code = u8::from(value.is_sign_positive()); + let full_code = ((sign_code as u32) << ex_bits) + ex_code as u32; + residual_dot_code += value * (full_code as f32 + code_bias); + *ex_code_value = ex_code; + } + + pack_ex_code_bits(ex_code_dst, ex_code_values_dst, ex_bits); + residual_dot_code +} + impl RabitQuantizer { pub fn new(num_bits: u8, dim: i32) -> Self { Self::new_with_rotation::(num_bits, dim, RQRotationType::default()) @@ -188,6 +324,60 @@ impl RabitQuantizer { } } + pub(crate) fn rotate_fsl_to_f32(&self, vectors: &FixedSizeListArray) -> Result> { + match vectors.value_type() { + DataType::Float16 => self.rotate_fsl_to_f32_typed::(vectors), + DataType::Float32 => self.rotate_fsl_to_f32_typed::(vectors), + DataType::Float64 => self.rotate_fsl_to_f32_typed::(vectors), + value_type => Err(Error::invalid_input(format!( + "Unsupported data type: {:?}", + value_type + ))), + } + } + + fn rotate_fsl_to_f32_typed( + &self, + vectors: &FixedSizeListArray, + ) -> Result> + where + T::Native: AsPrimitive + Sync, + { + let dim = self.dim(); + if vectors.value_length() as usize != dim { + return Err(Error::invalid_input(format!( + "Vector dimension mismatch: {} != {}", + vectors.value_length(), + dim + ))); + } + let values = vectors + .values() + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::invalid_input(format!( + "Vector values have unexpected data type: {}", + vectors.value_type() + )) + })? 
+ .as_slice(); + let vec_mat = ndarray::ArrayView2::from_shape((vectors.len(), dim), values) + .map_err(|e| Error::invalid_input(e.to_string()))?; + let rotated = self.rotate_vectors::(vec_mat.t()); + let code_dim = self.code_dim(); + let mut row_major = vec![0.0f32; vectors.len() * code_dim]; + for row_idx in 0..vectors.len() { + for (dst, value) in row_major[row_idx * code_dim..(row_idx + 1) * code_dim] + .iter_mut() + .zip(rotated.column(row_idx).iter()) + { + *dst = *value; + } + } + Ok(row_major) + } + pub fn dim(&self) -> usize { self.code_dim() } @@ -209,7 +399,7 @@ impl RabitQuantizer { ))); } - let sqrt_dim = (dim as f32 * self.metadata.num_bits as f32).sqrt(); + let sqrt_dim = (dim as f32).sqrt(); let values = residual_vectors .values() .as_any() @@ -308,6 +498,131 @@ impl RabitQuantizer { } } } + + pub(crate) fn quantize_split( + &self, + vectors: &FixedSizeListArray, + ) -> Result { + match vectors.value_type() { + DataType::Float16 => self.transform_split::(vectors), + DataType::Float32 => self.transform_split::(vectors), + DataType::Float64 => self.transform_split::(vectors), + value_type => Err(Error::invalid_input(format!( + "Unsupported data type: {:?}", + value_type + ))), + } + } + + fn transform_split( + &self, + residual_vectors: &FixedSizeListArray, + ) -> Result + where + T::Native: AsPrimitive + Sync, + { + let ex_bits = rabit_ex_bits(self.metadata.num_bits)?; + if ex_bits == 0 { + return Ok(RabitQuantizedBatch { + binary_codes: self.transform::(residual_vectors)?, + ex_codes: None, + ex_res_dot_dists: None, + rotated_residuals: None, + ex_code_values: None, + }); + } + + let n = residual_vectors.len(); + let dim = self.dim(); + debug_assert_eq!(residual_vectors.values().len(), n * dim); + let values = residual_vectors + .values() + .as_any() + .downcast_ref::() + .unwrap() + .as_slice(); + let code_dim = self.code_dim(); + let code_bytes = rabit_binary_code_bytes(code_dim); + let ex_code_bytes = rabit_ex_code_bytes(code_dim, ex_bits)?; + + 
let mut encoded_codes = vec![0u8; n * code_bytes]; + let mut encoded_ex_codes = vec![0u8; n * ex_code_bytes]; + let mut ex_res_dot_dists = vec![0.0f32; n]; + let mut rotated_residuals = vec![0.0f32; n * code_dim]; + let mut ex_code_values = vec![0u8; n * code_dim]; + + match self.rotation_type() { + RQRotationType::Matrix => { + let vectors = ndarray::ArrayView2::from_shape((n, dim), values) + .map_err(|e| Error::invalid_input(e.to_string()))?; + let vectors = vectors.t(); + let rotated_vectors = self.rotate_vectors::(vectors); + + encoded_codes + .chunks_mut(code_bytes) + .zip(encoded_ex_codes.chunks_mut(ex_code_bytes)) + .zip(rotated_residuals.chunks_mut(code_dim)) + .zip(ex_code_values.chunks_mut(code_dim)) + .zip(ex_res_dot_dists.iter_mut()) + .enumerate() + .for_each( + |( + row_idx, + ((((code_dst, ex_dst), rotated_dst), ex_values_dst), ex_dot_dst), + )| { + for (dst, value) in rotated_dst + .iter_mut() + .zip(rotated_vectors.column(row_idx).iter()) + { + *dst = *value; + } + pack_sign_bits(code_dst, rotated_dst); + *ex_dot_dst = + quantize_ex_code(rotated_dst, ex_bits, ex_dst, ex_values_dst); + }, + ); + } + RQRotationType::Fast => { + let signs = self.fast_rotation_signs(); + encoded_codes + .par_chunks_mut(code_bytes) + .zip(encoded_ex_codes.par_chunks_mut(ex_code_bytes)) + .zip(rotated_residuals.par_chunks_mut(code_dim)) + .zip(ex_code_values.par_chunks_mut(code_dim)) + .zip(ex_res_dot_dists.par_iter_mut()) + .zip(values.par_chunks_exact(dim)) + .for_each_init( + || (), + |_, + ( + ((((code_dst, ex_dst), rotated_dst), ex_values_dst), ex_dot_dst), + input, + )| { + apply_fast_rotation(input, rotated_dst, signs); + pack_sign_bits(code_dst, rotated_dst); + *ex_dot_dst = + quantize_ex_code(rotated_dst, ex_bits, ex_dst, ex_values_dst); + }, + ); + } + } + + let binary_codes = UInt8Array::from(encoded_codes); + let ex_codes = UInt8Array::from(encoded_ex_codes); + Ok(RabitQuantizedBatch { + binary_codes: Arc::new(FixedSizeListArray::try_new_from_values( + 
binary_codes, + code_bytes as i32, + )?), + ex_codes: Some(Arc::new(FixedSizeListArray::try_new_from_values( + ex_codes, + ex_code_bytes as i32, + )?)), + ex_res_dot_dists: Some(ex_res_dot_dists), + rotated_residuals: Some(rotated_residuals), + ex_code_values: Some(ex_code_values), + }) + } } impl Quantization for RabitQuantizer { @@ -320,7 +635,7 @@ impl Quantization for RabitQuantizer { _: lance_linalg::distance::DistanceType, params: &Self::BuildParams, ) -> Result { - validate_supported_rq_num_bits(params.num_bits)?; + validate_rq_num_bits(params.num_bits)?; let dim = data.as_fixed_size_list().value_length() as usize; if !dim.is_multiple_of(u8::BITS as usize) { @@ -453,7 +768,7 @@ impl Quantization for RabitQuantizer { metadata: &Self::Metadata, _: lance_linalg::distance::DistanceType, ) -> Result { - validate_supported_rq_num_bits(metadata.num_bits)?; + validate_rq_num_bits(metadata.num_bits)?; Ok(Quantizer::Rabit(Self { metadata: metadata.clone(), })) @@ -464,7 +779,15 @@ impl Quantization for RabitQuantizer { } fn extra_fields(&self) -> Vec { - vec![ADD_FACTORS_FIELD.clone(), SCALE_FACTORS_FIELD.clone()] + let mut fields = vec![ADD_FACTORS_FIELD.clone(), SCALE_FACTORS_FIELD.clone()]; + if let Some(ex_code_field) = rabit_ex_code_field(self.code_dim(), self.metadata.num_bits) + .expect("RabitQ num_bits should be validated") + { + fields.push(ex_code_field); + fields.push(EX_ADD_FACTORS_FIELD.clone()); + fields.push(EX_SCALE_FACTORS_FIELD.clone()); + } + fields } } @@ -753,7 +1076,7 @@ mod tests { } #[test] - fn test_rabit_quantizer_rejects_unsupported_num_bits() { + fn test_rabit_quantizer_accepts_multi_bit_range() { let vectors = Float32Array::from(vec![0.0f32; 4 * 32]); let fsl = FixedSizeListArray::try_new_from_values(vectors, 32).unwrap(); @@ -765,16 +1088,31 @@ mod tests { err ); - let err = - RabitQuantizer::build(&fsl, DistanceType::L2, &RQBuildParams::new(2)).unwrap_err(); - assert!( - err.to_string().contains("only num_bits=1 is supported"), - "{}", - 
err - ); + for rotation_type in [RQRotationType::Fast, RQRotationType::Matrix] { + let quantizer = RabitQuantizer::build( + &fsl, + DistanceType::L2, + &RQBuildParams::with_rotation_type(9, rotation_type), + ) + .unwrap(); + let quantized = quantizer.quantize_split(&fsl).unwrap(); + assert!(quantized.ex_codes.is_some()); + assert_eq!( + quantized.binary_codes.as_fixed_size_list().value_length(), + 4 + ); + assert_eq!( + quantized + .ex_codes + .unwrap() + .as_fixed_size_list() + .value_length(), + 32 + ); + } let err = - RabitQuantizer::build(&fsl, DistanceType::L2, &RQBuildParams::new(9)).unwrap_err(); + RabitQuantizer::build(&fsl, DistanceType::L2, &RQBuildParams::new(10)).unwrap_err(); assert!( err.to_string().contains("IVF_RQ num_bits must be in"), "{}", diff --git a/rust/lance-index/src/vector/bq/storage.rs b/rust/lance-index/src/vector/bq/storage.rs index ca66a19d858..4c2aeb7363e 100644 --- a/rust/lance-index/src/vector/bq/storage.rs +++ b/rust/lance-index/src/vector/bq/storage.rs @@ -38,10 +38,12 @@ use serde::{Deserialize, Serialize}; use crate::frag_reuse::FragReuseIndex; use crate::pb; use crate::vector::bq::rotation::{apply_fast_rotation, apply_fast_rotation_in_place}; -use crate::vector::bq::transform::{ADD_FACTORS_COLUMN, SCALE_FACTORS_COLUMN}; +use crate::vector::bq::transform::{ + ADD_FACTORS_COLUMN, EX_ADD_FACTORS_COLUMN, EX_SCALE_FACTORS_COLUMN, SCALE_FACTORS_COLUMN, +}; use crate::vector::bq::{ RQRotationType, rabit_binary_code_bytes, rabit_ex_bits, rabit_ex_code_bytes, - validate_supported_rq_num_bits, + validate_rq_num_bits, }; use crate::vector::pq::storage::transpose; use crate::vector::quantizer::{QuantizerMetadata, QuantizerStorage}; @@ -49,7 +51,7 @@ use crate::vector::storage::{DistCalculator, QueryResidual, VectorStore}; pub const RABIT_METADATA_KEY: &str = "lance:rabit"; pub const RABIT_CODE_COLUMN: &str = "_rabit_codes"; -pub const RABIT_EX_CODE_COLUMN: &str = "_rabit_ex_codes"; +pub const RABIT_EX_CODE_COLUMN: &str = "__ex_codes"; pub 
const SEGMENT_LENGTH: usize = 4; pub const SEGMENT_NUM_CODES: usize = 1 << SEGMENT_LENGTH; @@ -959,7 +961,7 @@ impl QuantizerStorage for RabitQuantizationStorage { distance_type: DistanceType, _fri: Option>, ) -> Result { - validate_supported_rq_num_bits(metadata.num_bits)?; + validate_rq_num_bits(metadata.num_bits)?; let row_ids = batch[ROW_ID].as_primitive::().clone(); let codes = batch[RABIT_CODE_COLUMN].as_fixed_size_list().clone(); let expected_code_bytes = metadata.binary_code_bytes(); @@ -978,6 +980,48 @@ impl QuantizerStorage for RabitQuantizationStorage { let scale_factors = batch[SCALE_FACTORS_COLUMN] .as_primitive::() .clone(); + let ex_bits = rabit_ex_bits(metadata.num_bits)?; + if ex_bits != 0 { + let ex_codes = batch + .column_by_name(RABIT_EX_CODE_COLUMN) + .ok_or_else(|| { + Error::invalid_input(format!( + "RabitQ num_bits={} requires {} column", + metadata.num_bits, RABIT_EX_CODE_COLUMN + )) + })? + .as_fixed_size_list() + .clone(); + let expected_ex_code_bytes = rabit_ex_code_bytes(metadata.rotated_dim(), ex_bits)?; + if ex_codes.value_length() as usize != expected_ex_code_bytes { + return Err(Error::invalid_input(format!( + "RabitQ ex-code byte width mismatch: column {} has {} bytes, metadata rotated_dim={} ex_bits={} requires {} bytes", + RABIT_EX_CODE_COLUMN, + ex_codes.value_length(), + metadata.rotated_dim(), + ex_bits, + expected_ex_code_bytes + ))); + } + batch + .column_by_name(EX_ADD_FACTORS_COLUMN) + .ok_or_else(|| { + Error::invalid_input(format!( + "RabitQ num_bits={} requires {} column", + metadata.num_bits, EX_ADD_FACTORS_COLUMN + )) + })? + .as_primitive::(); + batch + .column_by_name(EX_SCALE_FACTORS_COLUMN) + .ok_or_else(|| { + Error::invalid_input(format!( + "RabitQ num_bits={} requires {} column", + metadata.num_bits, EX_SCALE_FACTORS_COLUMN + )) + })? 
+ .as_primitive::(); + } let (batch, codes) = if !metadata.packed { let codes = pack_codes(&codes); @@ -1058,14 +1102,20 @@ impl QuantizerStorage for RabitQuantizationStorage { .replace_column_by_name(RABIT_CODE_COLUMN, codes)? }; let codes = batch[RABIT_CODE_COLUMN].as_fixed_size_list().clone(); + let add_factors = batch[ADD_FACTORS_COLUMN] + .as_primitive::() + .clone(); + let scale_factors = batch[SCALE_FACTORS_COLUMN] + .as_primitive::() + .clone(); Ok(Self { metadata: self.metadata.clone(), distance_type: self.distance_type, batch, codes, - add_factors: self.add_factors.clone(), - scale_factors: self.scale_factors.clone(), + add_factors, + scale_factors, row_ids: new_row_ids, }) } @@ -1368,12 +1418,12 @@ mod tests { assert_eq!(*bin_code_bytes, 16); assert!(rabit_ex_code_field(128, 1).unwrap().is_none()); - let ex_field = rabit_ex_code_field(128, 4).unwrap().unwrap(); + let ex_field = rabit_ex_code_field(128, 9).unwrap().unwrap(); assert_eq!(ex_field.name(), RABIT_EX_CODE_COLUMN); let DataType::FixedSizeList(_, ex_code_bytes) = ex_field.data_type() else { panic!("ex-code field should be FixedSizeList"); }; - assert_eq!(*ex_code_bytes, 48); + assert_eq!(*ex_code_bytes, 128); } fn make_test_codes(num_vectors: usize, code_dim: i32) -> FixedSizeListArray { @@ -1419,6 +1469,56 @@ mod tests { .unwrap() } + fn make_test_ex_codes(num_vectors: usize, code_dim: usize, num_bits: u8) -> FixedSizeListArray { + let ex_bits = rabit_ex_bits(num_bits).unwrap(); + let ex_code_bytes = rabit_ex_code_bytes(code_dim, ex_bits).unwrap(); + let values = (0..num_vectors * ex_code_bytes) + .map(|idx| (idx % 251) as u8) + .collect::>(); + FixedSizeListArray::try_new_from_values(UInt8Array::from(values), ex_code_bytes as i32) + .unwrap() + } + + fn make_test_batch_with_ex( + codes: FixedSizeListArray, + ex_codes: FixedSizeListArray, + ) -> RecordBatch { + let num_rows = codes.len(); + RecordBatch::try_from_iter(vec![ + ( + ROW_ID, + Arc::new(UInt64Array::from_iter_values(0..num_rows as 
u64)) as ArrayRef, + ), + (RABIT_CODE_COLUMN, Arc::new(codes) as ArrayRef), + ( + ADD_FACTORS_COLUMN, + Arc::new(Float32Array::from_iter_values( + (0..num_rows).map(|v| v as f32), + )) as ArrayRef, + ), + ( + SCALE_FACTORS_COLUMN, + Arc::new(Float32Array::from_iter_values( + (0..num_rows).map(|v| v as f32 + 0.5), + )) as ArrayRef, + ), + (RABIT_EX_CODE_COLUMN, Arc::new(ex_codes) as ArrayRef), + ( + EX_ADD_FACTORS_COLUMN, + Arc::new(Float32Array::from_iter_values( + (0..num_rows).map(|v| v as f32 + 10.5), + )) as ArrayRef, + ), + ( + EX_SCALE_FACTORS_COLUMN, + Arc::new(Float32Array::from_iter_values( + (0..num_rows).map(|v| v as f32 + 1.5), + )) as ArrayRef, + ), + ]) + .unwrap() + } + fn assert_codes_eq(actual: &FixedSizeListArray, expected: &FixedSizeListArray) { assert_eq!(actual.len(), expected.len()); assert_eq!(actual.value_length(), expected.value_length()); @@ -1450,7 +1550,7 @@ mod tests { } #[test] - fn test_try_from_batch_rejects_unsupported_rq_num_bits() { + fn test_try_from_batch_requires_ex_columns_for_multi_bit_rq() { let original_codes = make_test_codes(50, 64); let mut metadata = make_test_metadata(original_codes.value_length() as usize * 8); metadata.num_bits = 2; @@ -1463,12 +1563,59 @@ mod tests { ) .unwrap_err(); assert!( - err.to_string().contains("only num_bits=1 is supported"), + err.to_string().contains("requires __ex_codes column"), + "{}", + err + ); + } + + #[test] + fn test_try_from_batch_requires_ex_add_factors_for_multi_bit_rq() { + let original_codes = make_test_codes(50, 64); + let code_dim = original_codes.value_length() as usize * 8; + let ex_codes = make_test_ex_codes(original_codes.len(), code_dim, 9); + let mut metadata = make_test_metadata(code_dim); + metadata.num_bits = 9; + let batch = make_test_batch_with_ex(original_codes, ex_codes) + .drop_column(EX_ADD_FACTORS_COLUMN) + .unwrap(); + + let err = + RabitQuantizationStorage::try_from_batch(batch, &metadata, DistanceType::L2, None) + .unwrap_err(); + assert!( + 
err.to_string().contains("requires __add_factors_ex column"), "{}", err ); } + #[test] + fn test_try_from_batch_accepts_multi_bit_rq_split_codes() { + let original_codes = make_test_codes(50, 64); + let code_dim = original_codes.value_length() as usize * 8; + let ex_codes = make_test_ex_codes(original_codes.len(), code_dim, 9); + let mut metadata = make_test_metadata(code_dim); + metadata.num_bits = 9; + + let storage = RabitQuantizationStorage::try_from_batch( + make_test_batch_with_ex(original_codes, ex_codes), + &metadata, + DistanceType::L2, + None, + ) + .unwrap(); + + assert!(storage.metadata().packed); + let stored_batch = storage.to_batches().unwrap().next().unwrap(); + assert_eq!( + stored_batch[RABIT_EX_CODE_COLUMN] + .as_fixed_size_list() + .value_length(), + 64 + ); + } + #[test] fn test_remap_preserves_packed_rq_storage_layout() { let original_codes = make_test_codes(50, 64); @@ -1502,4 +1649,54 @@ mod tests { let repacked = pack_codes(&unpack_codes(remapped_codes)); assert_codes_eq(remapped_codes, &repacked); } + + #[test] + fn test_remap_preserves_multi_bit_rq_split_columns() { + let original_codes = make_test_codes(50, 64); + let code_dim = original_codes.value_length() as usize * 8; + let ex_codes = make_test_ex_codes(original_codes.len(), code_dim, 9); + let mut metadata = make_test_metadata(code_dim); + metadata.num_bits = 9; + let storage = RabitQuantizationStorage::try_from_batch( + make_test_batch_with_ex(original_codes.clone(), ex_codes), + &metadata, + DistanceType::L2, + None, + ) + .unwrap(); + + let mut mapping = HashMap::new(); + mapping.insert(1, Some(101)); + mapping.insert(3, None); + mapping.insert(4, Some(104)); + + let remapped = storage.remap(&mapping).unwrap(); + let remapped_batch = remapped.to_batches().unwrap().next().unwrap(); + let remapped_row_ids = remapped_batch[ROW_ID].as_primitive::().values(); + let expected_row_ids = UInt64Array::from_iter_values( + [0, 101, 2, 104] + .into_iter() + .chain(5..original_codes.len() as 
u64), + ); + assert_eq!(remapped_row_ids, expected_row_ids.values()); + + assert_eq!( + remapped_batch[RABIT_EX_CODE_COLUMN] + .as_fixed_size_list() + .value_length(), + 64 + ); + assert_eq!( + &remapped_batch[EX_ADD_FACTORS_COLUMN] + .as_primitive::() + .values()[..5], + &[10.5, 11.5, 12.5, 14.5, 15.5] + ); + assert_eq!( + &remapped_batch[EX_SCALE_FACTORS_COLUMN] + .as_primitive::() + .values()[..5], + &[1.5, 2.5, 3.5, 5.5, 6.5] + ); + } } diff --git a/rust/lance-index/src/vector/bq/transform.rs b/rust/lance-index/src/vector/bq/transform.rs index af3e563c173..391f6ab158f 100644 --- a/rust/lance-index/src/vector/bq/transform.rs +++ b/rust/lance-index/src/vector/bq/transform.rs @@ -6,7 +6,9 @@ use std::sync::{Arc, LazyLock}; use arrow::array::AsArray; use arrow::datatypes::{Float16Type, Float32Type, Float64Type, UInt32Type}; -use arrow_array::{Array, ArrowNativeTypeOp, FixedSizeListArray, Float32Array, RecordBatch}; +use arrow_array::{ + Array, ArrowNativeTypeOp, FixedSizeListArray, Float32Array, RecordBatch, UInt32Array, +}; use arrow_schema::DataType; use lance_arrow::RecordBatchExt; use lance_core::{Error, Result}; @@ -14,7 +16,8 @@ use lance_linalg::distance::{DistanceType, norm_squared_fsl}; use tracing::instrument; use crate::vector::bq::builder::RabitQuantizer; -use crate::vector::bq::storage::RABIT_CODE_COLUMN; +use crate::vector::bq::rabit_ex_bits; +use crate::vector::bq::storage::{RABIT_CODE_COLUMN, RABIT_EX_CODE_COLUMN}; use crate::vector::quantizer::Quantization; use crate::vector::transform::Transformer; use crate::vector::{CENTROID_DIST_COLUMN, PART_ID_COLUMN}; @@ -47,6 +50,7 @@ pub struct RQTransformer { rq: RabitQuantizer, distance_type: DistanceType, centroids_norm_square: Option, + rotated_centroids: Option>, vector_column: String, } @@ -56,18 +60,162 @@ impl RQTransformer { distance_type: DistanceType, centroids: FixedSizeListArray, vector_column: impl Into, - ) -> Self { + ) -> Result { // for dot product, the add factor is `1 - v*c + |c|^2`, so 
we need to compute |c|^2 let centroids_norm_square = (distance_type == DistanceType::Dot) .then(|| Float32Array::from(norm_squared_fsl(&centroids))); + let rotated_centroids = (rq.num_bits() > 1) + .then(|| rq.rotate_fsl_to_f32(&centroids)) + .transpose()?; - Self { + Ok(Self { rq, distance_type, centroids_norm_square, + rotated_centroids, vector_column: vector_column.into(), + }) + } +} + +struct RabitRawQueryFactors { + add_factors: Float32Array, + scale_factors: Float32Array, + ex_add_factors: Float32Array, + ex_scale_factors: Float32Array, +} + +#[inline] +fn factor_ratio(numerator: f32, denominator: f32) -> f32 { + if denominator == 0.0 { + 0.0 + } else { + numerator / denominator + } +} + +#[inline] +fn binary_factor_value(rotated_residual: f32) -> f32 { + if rotated_residual.is_sign_positive() { + 0.5 + } else { + -0.5 + } +} + +#[allow(clippy::too_many_arguments)] +fn compute_raw_query_factors( + distance_type: DistanceType, + res_norm_square: &Float32Array, + rotated_residuals: &[f32], + rotated_centroids: &[f32], + part_ids: &UInt32Array, + ex_code_values: &[u8], + ex_res_dot_dists: &[f32], + ex_bits: u8, + code_dim: usize, +) -> Result { + if !matches!(distance_type, DistanceType::L2 | DistanceType::Dot) { + return Err(Error::index(format!( + "RQ Transform: distance type {} not supported", + distance_type + ))); + } + + let num_rows = res_norm_square.len(); + debug_assert_eq!(rotated_residuals.len(), num_rows * code_dim); + debug_assert_eq!(ex_code_values.len(), num_rows * code_dim); + debug_assert_eq!(ex_res_dot_dists.len(), num_rows); + + let ex_code_bias = -((1u32 << ex_bits) as f32 - 0.5); + let mut add_factors = Vec::with_capacity(num_rows); + let mut scale_factors = Vec::with_capacity(num_rows); + let mut ex_add_factors = Vec::with_capacity(num_rows); + let mut ex_scale_factors = Vec::with_capacity(num_rows); + + for (row_idx, (&norm_square, &ex_res_dot)) in res_norm_square + .values() + .iter() + .zip(ex_res_dot_dists.iter()) + .enumerate() + { + let
part_id = part_ids.value(row_idx) as usize; + let centroid_start = part_id.checked_mul(code_dim).ok_or_else(|| { + Error::invalid_input(format!( + "RQ Transform: partition id {} overflows code_dim {}", + part_id, code_dim + )) + })?; + let centroid_end = centroid_start.checked_add(code_dim).ok_or_else(|| { + Error::invalid_input(format!( + "RQ Transform: partition id {} plus code_dim {} overflows", + part_id, code_dim + )) + })?; + if centroid_end > rotated_centroids.len() { + return Err(Error::invalid_input(format!( + "RQ Transform: partition id {} out of range for {} rotated centroids", + part_id, + rotated_centroids.len() / code_dim + ))); + } + + let row_start = row_idx * code_dim; + let row_end = row_start + code_dim; + let residual = &rotated_residuals[row_start..row_end]; + let centroid = &rotated_centroids[centroid_start..centroid_end]; + let ex_values = &ex_code_values[row_start..row_end]; + + let mut binary_res_dot = 0.0f32; + let mut binary_cent_dot = 0.0f32; + let mut ex_cent_dot = 0.0f32; + let mut residual_centroid_dot = 0.0f32; + for ((&residual_value, &centroid_value), &ex_code_value) in + residual.iter().zip(centroid.iter()).zip(ex_values.iter()) + { + let residual_value: f32 = residual_value; + let centroid_value: f32 = centroid_value; + let binary_code = if residual_value.is_sign_positive() { + 1u32 + } else { + 0u32 + }; + let binary_factor = binary_factor_value(residual_value); + let ex_factor = ((binary_code << ex_bits) + ex_code_value as u32) as f32 + ex_code_bias; + + binary_res_dot += residual_value * binary_factor; + binary_cent_dot += centroid_value * binary_factor; + ex_cent_dot += centroid_value * ex_factor; + residual_centroid_dot += residual_value * centroid_value; + } + + let binary_correction = factor_ratio(norm_square * binary_cent_dot, binary_res_dot); + let ex_correction = factor_ratio(norm_square * ex_cent_dot, ex_res_dot); + + match distance_type { + DistanceType::L2 => { + add_factors.push(norm_square + 2.0 * binary_correction); +
scale_factors.push(factor_ratio(-2.0 * norm_square, binary_res_dot)); + ex_add_factors.push(norm_square + 2.0 * ex_correction); + ex_scale_factors.push(factor_ratio(-2.0 * norm_square, ex_res_dot)); + } + DistanceType::Dot => { + let dot_base = 1.0 - residual_centroid_dot; + add_factors.push(dot_base + binary_correction); + scale_factors.push(factor_ratio(-norm_square, binary_res_dot)); + ex_add_factors.push(dot_base + ex_correction); + ex_scale_factors.push(factor_ratio(-norm_square, ex_res_dot)); + } + _ => unreachable!(), } } + + Ok(RabitRawQueryFactors { + add_factors: Float32Array::from(add_factors), + scale_factors: Float32Array::from(scale_factors), + ex_add_factors: Float32Array::from(ex_add_factors), + ex_scale_factors: Float32Array::from(ex_scale_factors), + }) } impl Debug for RQTransformer { @@ -79,7 +227,11 @@ impl Debug for RQTransformer { impl Transformer for RQTransformer { #[instrument(name = "RQTransformer::transform", level = "debug", skip_all)] fn transform(&self, batch: &RecordBatch) -> Result { - if batch.column_by_name(RABIT_CODE_COLUMN).is_some() { + let has_split_codes = self.rq.num_bits() == 1 + || (batch.column_by_name(RABIT_EX_CODE_COLUMN).is_some() + && batch.column_by_name(EX_ADD_FACTORS_COLUMN).is_some() + && batch.column_by_name(EX_SCALE_FACTORS_COLUMN).is_some()); + if batch.column_by_name(RABIT_CODE_COLUMN).is_some() && has_split_codes { return Ok(batch.clone()); } @@ -117,89 +269,157 @@ impl Transformer for RQTransformer { } }; - let rq_codes = self.rq.quantize(&residual_vectors)?; - let codes_fsl = rq_codes.as_fixed_size_list(); - - let ip_rq_res = match residual_vectors.value_type() { - DataType::Float16 => Float32Array::from( - self.rq - .codes_res_dot_dists::(residual_vectors)?, - ), - DataType::Float32 => Float32Array::from( - self.rq - .codes_res_dot_dists::(residual_vectors)?, - ), - DataType::Float64 => Float32Array::from( - self.rq - .codes_res_dot_dists::(residual_vectors)?, - ), - _ => { - return 
Err(Error::index(format!( - "RQ Transform: unsupported residual vector data type: {}", - residual_vectors.data_type() - ))); - } - }; + let rq_codes = self.rq.quantize_split(residual_vectors)?; + let codes_fsl = rq_codes.binary_codes.as_fixed_size_list(); debug_assert_eq!(codes_fsl.len(), batch.num_rows()); - let add_factors = match self.distance_type { - DistanceType::L2 => res_norm_square.clone(), - DistanceType::Dot => { - // for dot, the add factor is `1 - v*c + |c|^2 = dist_v_c + |c|^2` - let part_ids = &batch[PART_ID_COLUMN]; - let part_ids = part_ids.as_primitive::(); - let centroids_norm_square = self.centroids_norm_square.as_ref().ok_or( - Error::index("RQ Transform: centroids norm square not found".to_string()), - )?; - let centroids_norm_square = - arrow::compute::take(centroids_norm_square, part_ids, None)?; - let centroids_norm_square = centroids_norm_square.as_primitive::(); - Float32Array::from_iter_values( - dist_v_c - .values() - .iter() - .zip(centroids_norm_square.values().iter()) - .map(|(dist_v_c, centroids_norm_square)| dist_v_c + centroids_norm_square), - ) - } - _ => { - return Err(Error::index(format!( - "RQ Transform: distance type {} not supported", - self.distance_type - ))); - } - }; + let mut batch = batch.try_with_column(self.rq.field(), rq_codes.binary_codes)?; + if self.rq.num_bits() == 1 { + // Preserve the released 1-bit residual-query estimator and factor layout. 
+ let ip_rq_res = match residual_vectors.value_type() { + DataType::Float16 => Float32Array::from( + self.rq + .codes_res_dot_dists::(residual_vectors)?, + ), + DataType::Float32 => Float32Array::from( + self.rq + .codes_res_dot_dists::(residual_vectors)?, + ), + DataType::Float64 => Float32Array::from( + self.rq + .codes_res_dot_dists::(residual_vectors)?, + ), + _ => { + return Err(Error::index(format!( + "RQ Transform: unsupported residual vector data type: {}", + residual_vectors.data_type() + ))); + } + }; + + let add_factors = match self.distance_type { + DistanceType::L2 => res_norm_square.clone(), + DistanceType::Dot => { + // for dot, the add factor is `1 - v*c + |c|^2 = dist_v_c + |c|^2` + let part_ids = &batch[PART_ID_COLUMN]; + let part_ids = part_ids.as_primitive::(); + let centroids_norm_square = self.centroids_norm_square.as_ref().ok_or( + Error::index("RQ Transform: centroids norm square not found".to_string()), + )?; + let centroids_norm_square = + arrow::compute::take(centroids_norm_square, part_ids, None)?; + let centroids_norm_square = centroids_norm_square.as_primitive::(); + Float32Array::from_iter_values( + dist_v_c + .values() + .iter() + .zip(centroids_norm_square.values().iter()) + .map(|(dist_v_c, centroids_norm_square)| { + dist_v_c + centroids_norm_square + }), + ) + } + _ => { + return Err(Error::index(format!( + "RQ Transform: distance type {} not supported", + self.distance_type + ))); + } + }; - let scale_factors = match self.distance_type { - DistanceType::L2 => Float32Array::from_iter_values( - res_norm_square.values().iter().zip(ip_rq_res.values()).map( - |(res_norm_square, ip_rq_res)| { - (-2.0 * res_norm_square) - .div_checked(*ip_rq_res) - .unwrap_or_default() - }, + let scale_factors = match self.distance_type { + DistanceType::L2 => Float32Array::from_iter_values( + res_norm_square.values().iter().zip(ip_rq_res.values()).map( + |(res_norm_square, ip_rq_res)| { + (-2.0 * res_norm_square) + .div_checked(*ip_rq_res) + 
.unwrap_or_default() + }, + ), ), - ), - DistanceType::Dot => Float32Array::from_iter_values( - res_norm_square.values().iter().zip(ip_rq_res.values()).map( - |(res_norm_square, ip_rq_res)| { - -res_norm_square.div_checked(*ip_rq_res).unwrap_or_default() - }, + DistanceType::Dot => Float32Array::from_iter_values( + res_norm_square.values().iter().zip(ip_rq_res.values()).map( + |(res_norm_square, ip_rq_res)| { + -res_norm_square.div_checked(*ip_rq_res).unwrap_or_default() + }, + ), ), - ), - _ => { - return Err(Error::index(format!( - "RQ Transform: distance type {} not supported", - self.distance_type - ))); - } - }; + _ => { + return Err(Error::index(format!( + "RQ Transform: distance type {} not supported", + self.distance_type + ))); + } + }; - let batch = batch.try_with_column(self.rq.field(), Arc::new(rq_codes))?; - let batch = batch - .try_with_column(ADD_FACTORS_FIELD.clone(), Arc::new(add_factors))? - .drop_column(CENTROID_DIST_COLUMN)?; - let batch = batch.try_with_column(SCALE_FACTORS_FIELD.clone(), Arc::new(scale_factors))?; + batch = batch + .try_with_column(ADD_FACTORS_FIELD.clone(), Arc::new(add_factors))? + .try_with_column(SCALE_FACTORS_FIELD.clone(), Arc::new(scale_factors))?; + } else { + // Multi-bit RQ is stored for the RaBitQ-Library raw-query estimator. + // Search remains gated until that query path lands. 
+ let ex_codes = rq_codes.ex_codes.ok_or_else(|| { + Error::internal("RabitQ multi-bit quantization did not return ex codes".to_string()) + })?; + let ex_res_dot_dists = rq_codes.ex_res_dot_dists.ok_or_else(|| { + Error::internal( + "RabitQ multi-bit quantization did not return ex dot factors".to_string(), + ) + })?; + let rotated_residuals = rq_codes.rotated_residuals.ok_or_else(|| { + Error::internal( + "RabitQ multi-bit quantization did not return rotated residuals".to_string(), + ) + })?; + let ex_code_values = rq_codes.ex_code_values.ok_or_else(|| { + Error::internal( + "RabitQ multi-bit quantization did not return ex code values".to_string(), + ) + })?; + + let part_ids = batch[PART_ID_COLUMN].as_primitive::(); + let rotated_centroids = self.rotated_centroids.as_ref().ok_or_else(|| { + Error::internal("RabitQ multi-bit transformer is missing rotated centroids") + })?; + let ex_bits = rabit_ex_bits(self.rq.num_bits())?; + let raw_query_factors = compute_raw_query_factors( + self.distance_type, + &res_norm_square, + &rotated_residuals, + rotated_centroids, + part_ids, + &ex_code_values, + &ex_res_dot_dists, + ex_bits, + self.rq.dim(), + )?; + + batch = batch + .try_with_column( + ADD_FACTORS_FIELD.clone(), + Arc::new(raw_query_factors.add_factors), + )? + .try_with_column( + SCALE_FACTORS_FIELD.clone(), + Arc::new(raw_query_factors.scale_factors), + )? + .try_with_column( + crate::vector::bq::storage::rabit_ex_code_field( + self.rq.dim(), + self.rq.num_bits(), + )? + .expect("ex-code field should exist for num_bits > 1"), + ex_codes, + )? + .try_with_column( + EX_ADD_FACTORS_FIELD.clone(), + Arc::new(raw_query_factors.ex_add_factors), + )? + .try_with_column( + EX_SCALE_FACTORS_FIELD.clone(), + Arc::new(raw_query_factors.ex_scale_factors), + )?; + } let batch = batch .drop_column(&self.vector_column)? 
@@ -207,3 +427,185 @@ impl Transformer for RQTransformer { Ok(batch) } } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow::array::AsArray; + use arrow::datatypes::{Float32Type, UInt8Type}; + use arrow_array::{ArrayRef, FixedSizeListArray, Float32Array, RecordBatch, UInt32Array}; + use lance_arrow::FixedSizeListArrayExt; + use lance_linalg::distance::DistanceType; + + use crate::vector::bq::RQRotationType; + use crate::vector::bq::builder::RabitQuantizer; + use crate::vector::bq::storage::RABIT_EX_CODE_COLUMN; + use crate::vector::transform::Transformer; + use crate::vector::{CENTROID_DIST_COLUMN, PART_ID_COLUMN}; + + use super::{ + ADD_FACTORS_COLUMN, EX_ADD_FACTORS_COLUMN, EX_SCALE_FACTORS_COLUMN, RQTransformer, + compute_raw_query_factors, + }; + + #[test] + fn test_rq_transformer_writes_multi_bit_ex_factors() { + let rq = RabitQuantizer::new_with_rotation::(4, 8, RQRotationType::Fast); + let centroids = + FixedSizeListArray::try_new_from_values(Float32Array::from(vec![0.0f32; 8]), 8) + .unwrap(); + let transformer = + RQTransformer::new(rq.clone(), DistanceType::L2, centroids, "vector").unwrap(); + assert!(transformer.rotated_centroids.is_some()); + + let residual_vectors = FixedSizeListArray::try_new_from_values( + Float32Array::from(vec![ + 1.0, -2.0, 3.0, -4.0, 1.5, -2.5, 3.5, -4.5, 0.5, -1.0, 1.5, -2.0, 2.5, -3.0, 3.5, + -4.0, + ]), + 8, + ) + .unwrap(); + let res_norm_square = Float32Array::from(vec![73.0f32, 47.0]); + let batch = RecordBatch::try_from_iter(vec![ + ("vector", Arc::new(residual_vectors.clone()) as ArrayRef), + ( + PART_ID_COLUMN, + Arc::new(UInt32Array::from(vec![0, 0])) as ArrayRef, + ), + ( + CENTROID_DIST_COLUMN, + Arc::new(res_norm_square.clone()) as ArrayRef, + ), + ]) + .unwrap(); + + let transformed = transformer.transform(&batch).unwrap(); + assert!(transformed.column_by_name(RABIT_EX_CODE_COLUMN).is_some()); + assert_eq!( + transformed[RABIT_EX_CODE_COLUMN] + .as_fixed_size_list() + .value_length(), + 3 + ); + 
assert!( + transformed[RABIT_EX_CODE_COLUMN] + .as_fixed_size_list() + .values() + .as_primitive::() + .values() + .iter() + .any(|value| *value != 0) + ); + let expected_ex_dots = rq + .quantize_split(&residual_vectors) + .unwrap() + .ex_res_dot_dists + .unwrap(); + let ex_add_factors = transformed[EX_ADD_FACTORS_COLUMN].as_primitive::(); + assert_eq!(ex_add_factors.values(), res_norm_square.values()); + let ex_scale_factors = transformed[EX_SCALE_FACTORS_COLUMN].as_primitive::(); + for ((actual, norm), ex_dot) in ex_scale_factors + .values() + .iter() + .zip(res_norm_square.values()) + .zip(expected_ex_dots) + { + let expected = if ex_dot == 0.0 { + 0.0 + } else { + -2.0 * norm / ex_dot + }; + assert!((actual - expected).abs() < 1e-6); + } + assert!(transformed.column_by_name("vector").is_none()); + assert!(transformed.column_by_name(CENTROID_DIST_COLUMN).is_none()); + assert!(transformed.column_by_name(ADD_FACTORS_COLUMN).is_some()); + } + + #[test] + fn test_rq_transformer_caches_rotated_centroids_only_for_multi_bit() { + let centroids = + FixedSizeListArray::try_new_from_values(Float32Array::from(vec![0.0f32; 8]), 8) + .unwrap(); + let binary_rq = + RabitQuantizer::new_with_rotation::(1, 8, RQRotationType::Fast); + let binary_transformer = + RQTransformer::new(binary_rq, DistanceType::L2, centroids.clone(), "vector").unwrap(); + assert!(binary_transformer.rotated_centroids.is_none()); + + let multi_bit_rq = + RabitQuantizer::new_with_rotation::(4, 8, RQRotationType::Fast); + let multi_bit_transformer = + RQTransformer::new(multi_bit_rq, DistanceType::L2, centroids, "vector").unwrap(); + assert_eq!( + multi_bit_transformer + .rotated_centroids + .as_ref() + .unwrap() + .len(), + 8 + ); + } + + #[test] + fn test_raw_query_factors_match_reference_l2_formula() { + let res_norm_square = Float32Array::from(vec![5.0f32, 7.0]); + let rotated_residuals = vec![2.0, -1.0, 0.0, 0.0]; + let rotated_centroids = vec![3.0, 4.0]; + let part_ids = UInt32Array::from(vec![0, 0]); 
+ let ex_code_values = vec![1, 0, 0, 0]; + let ex_res_dot_dists = vec![4.5, 0.0]; + + let factors = compute_raw_query_factors( + DistanceType::L2, + &res_norm_square, + &rotated_residuals, + &rotated_centroids, + &part_ids, + &ex_code_values, + &ex_res_dot_dists, + 1, + 2, + ) + .unwrap(); + + assert!((factors.add_factors.value(0) - 1.6666667).abs() < 1e-5); + assert!((factors.scale_factors.value(0) + 6.6666665).abs() < 1e-5); + assert!((factors.ex_add_factors.value(0) - 1.6666667).abs() < 1e-5); + assert!((factors.ex_scale_factors.value(0) + 2.2222223).abs() < 1e-5); + assert_eq!(factors.add_factors.value(1), 7.0); + assert_eq!(factors.scale_factors.value(1), 0.0); + assert_eq!(factors.ex_add_factors.value(1), 7.0); + assert_eq!(factors.ex_scale_factors.value(1), 0.0); + } + + #[test] + fn test_raw_query_factors_match_reference_dot_formula() { + let res_norm_square = Float32Array::from(vec![5.0f32]); + let rotated_residuals = vec![2.0, -1.0]; + let rotated_centroids = vec![3.0, 4.0]; + let part_ids = UInt32Array::from(vec![0]); + let ex_code_values = vec![1, 0]; + let ex_res_dot_dists = vec![4.5]; + + let factors = compute_raw_query_factors( + DistanceType::Dot, + &res_norm_square, + &rotated_residuals, + &rotated_centroids, + &part_ids, + &ex_code_values, + &ex_res_dot_dists, + 1, + 2, + ) + .unwrap(); + + assert!((factors.add_factors.value(0) + 2.6666667).abs() < 1e-5); + assert!((factors.scale_factors.value(0) + 3.3333333).abs() < 1e-5); + assert!((factors.ex_add_factors.value(0) + 2.6666667).abs() < 1e-5); + assert!((factors.ex_scale_factors.value(0) + 1.1111112).abs() < 1e-5); + } +} diff --git a/rust/lance-index/src/vector/distributed/index_merger.rs b/rust/lance-index/src/vector/distributed/index_merger.rs index 9753c8a88c8..e93984bbcca 100755 --- a/rust/lance-index/src/vector/distributed/index_merger.rs +++ b/rust/lance-index/src/vector/distributed/index_merger.rs @@ -21,10 +21,12 @@ use crate::IndexMetadata as IndexMetaSchema; use crate::pb; use 
crate::vector::bq::storage::{ RABIT_CODE_COLUMN, RABIT_METADATA_KEY, RabitQuantizationMetadata, pack_codes, - rabit_binary_code_field, + rabit_binary_code_field, rabit_ex_code_field, }; -use crate::vector::bq::transform::{ADD_FACTORS_FIELD, SCALE_FACTORS_FIELD}; -use crate::vector::bq::validate_supported_rq_num_bits; +use crate::vector::bq::transform::{ + ADD_FACTORS_FIELD, EX_ADD_FACTORS_FIELD, EX_SCALE_FACTORS_FIELD, SCALE_FACTORS_FIELD, +}; +use crate::vector::bq::validate_rq_num_bits; use crate::vector::flat::index::FlatMetadata; use crate::vector::ivf::storage::{IVF_METADATA_KEY, IvfModel as IvfStorageModel}; use crate::vector::pq::storage::{PQ_METADATA_KEY, ProductQuantizationMetadata, transpose}; @@ -299,12 +301,18 @@ pub async fn init_writer_for_rq( rq_meta: &RabitQuantizationMetadata, format_version: LanceFileVersion, ) -> Result { - let arrow_schema = ArrowSchema::new(vec![ + let mut fields = vec![ (*ROW_ID_FIELD).clone(), rabit_binary_code_field(rq_meta.rotated_dim()), ADD_FACTORS_FIELD.clone(), SCALE_FACTORS_FIELD.clone(), - ]); + ]; + if let Some(ex_code_field) = rabit_ex_code_field(rq_meta.rotated_dim(), rq_meta.num_bits)? 
{ + fields.push(ex_code_field); + fields.push(EX_ADD_FACTORS_FIELD.clone()); + fields.push(EX_SCALE_FACTORS_FIELD.clone()); + } + let arrow_schema = ArrowSchema::new(fields); let writer = object_store.create(aux_out).await?; let mut w = FileWriter::try_new( writer, @@ -982,7 +990,7 @@ pub async fn merge_partial_vector_auxiliary_files( let rotate_mat_bytes = reader.read_global_buffer(buf_idx).await?; rq_meta_parsed.parse_buffer(rotate_mat_bytes)?; } - validate_supported_rq_num_bits(rq_meta_parsed.num_bits)?; + validate_rq_num_bits(rq_meta_parsed.num_bits)?; let d0 = rq_meta_parsed.rotated_dim(); if d0 == 0 { @@ -1523,6 +1531,8 @@ mod tests { use prost::Message; use crate::vector::bq::RQRotationType; + use crate::vector::bq::storage::RABIT_EX_CODE_COLUMN; + use crate::vector::bq::transform::{EX_ADD_FACTORS_COLUMN, EX_SCALE_FACTORS_COLUMN}; lance_testing::define_stage_event_progress!( RecordingProgress, IndexBuildProgress, @@ -2053,7 +2063,14 @@ mod tests { distance_type: DistanceType, ) -> Result { let num_bytes = (metadata.code_dim as usize).div_ceil(u8::BITS as usize); - let arrow_schema = ArrowSchema::new(vec![ + let ex_code_field = rabit_ex_code_field(metadata.code_dim as usize, metadata.num_bits)?; + let ex_code_bytes = ex_code_field.as_ref().map(|field| { + let DataType::FixedSizeList(_, num_bytes) = field.data_type() else { + panic!("RQ ex-code field should be FixedSizeList"); + }; + *num_bytes as usize + }); + let mut fields = vec![ (*ROW_ID_FIELD).clone(), Field::new( RABIT_CODE_COLUMN, @@ -2065,7 +2082,13 @@ mod tests { ), ADD_FACTORS_FIELD.clone(), SCALE_FACTORS_FIELD.clone(), - ]); + ]; + if let Some(field) = ex_code_field { + fields.push(field); + fields.push(EX_ADD_FACTORS_FIELD.clone()); + fields.push(EX_SCALE_FACTORS_FIELD.clone()); + } + let arrow_schema = ArrowSchema::new(fields); let writer = store.create(aux_path).await?; let mut v2w = V2Writer::try_new( @@ -2094,6 +2117,10 @@ mod tests { let mut codes = Vec::with_capacity(total_rows * num_bytes); 
let mut add_factors = Vec::with_capacity(total_rows); let mut scale_factors = Vec::with_capacity(total_rows); + let mut ex_codes = + ex_code_bytes.map(|num_bytes| Vec::with_capacity(total_rows * num_bytes)); + let mut ex_add_factors = Vec::with_capacity(total_rows); + let mut ex_scale_factors = Vec::with_capacity(total_rows); let mut current_row_id = base_row_id; for (pid, len) in lengths.iter().enumerate() { @@ -2105,21 +2132,34 @@ mod tests { } add_factors.push(pid as f32 + row_offset as f32 * 0.1); scale_factors.push(pid as f32 + row_offset as f32 * 0.2); + if let (Some(ex_codes), Some(ex_code_bytes)) = (ex_codes.as_mut(), ex_code_bytes) { + for b in 0..ex_code_bytes { + ex_codes.push((17 + pid + row_offset + b) as u8); + } + ex_add_factors.push(pid as f32 + 10.0 + row_offset as f32 * 0.2); + ex_scale_factors.push(pid as f32 + 1.0 + row_offset as f32 * 0.2); + } } } - let batch = RecordBatch::try_new( - Arc::new(arrow_schema), - vec![ - Arc::new(UInt64Array::from(row_ids)), - Arc::new(FixedSizeListArray::try_new_from_values( - UInt8Array::from(codes), - num_bytes as i32, - )?), - Arc::new(Float32Array::from(add_factors)), - Arc::new(Float32Array::from(scale_factors)), - ], - )?; + let mut columns: Vec> = vec![ + Arc::new(UInt64Array::from(row_ids)), + Arc::new(FixedSizeListArray::try_new_from_values( + UInt8Array::from(codes), + num_bytes as i32, + )?), + Arc::new(Float32Array::from(add_factors)), + Arc::new(Float32Array::from(scale_factors)), + ]; + if let (Some(ex_codes), Some(ex_code_bytes)) = (ex_codes, ex_code_bytes) { + columns.push(Arc::new(FixedSizeListArray::try_new_from_values( + UInt8Array::from(ex_codes), + ex_code_bytes as i32, + )?)); + columns.push(Arc::new(Float32Array::from(ex_add_factors))); + columns.push(Arc::new(Float32Array::from(ex_scale_factors))); + } + let batch = RecordBatch::try_new(Arc::new(arrow_schema), columns)?; v2w.write_batch(&batch).await?; v2w.finish().await?; @@ -2391,6 +2431,118 @@ mod tests { assert_eq!(total_rows, 
expected_total); } + #[tokio::test] + async fn test_merge_ivf_rq_multi_bit_preserves_split_columns() { + let object_store = ObjectStore::memory(); + let index_dir = Path::from("index/uuid_rq_multi_bit"); + + let partial0 = index_dir.clone().join("partial_0"); + let partial1 = index_dir.clone().join("partial_1"); + let aux0 = partial0.clone().join(INDEX_AUXILIARY_FILE_NAME); + let aux1 = partial1.clone().join(INDEX_AUXILIARY_FILE_NAME); + + let lengths0 = vec![2_u32, 1_u32]; + let lengths1 = vec![1_u32, 2_u32]; + + let rq_meta = RabitQuantizationMetadata { + rotate_mat: None, + rotate_mat_position: None, + fast_rotation_signs: Some(vec![0xAA; 2]), + rotation_type: RQRotationType::Fast, + code_dim: 16, + num_bits: 4, + packed: false, + }; + + write_rq_partial_aux( + &object_store, + &aux0, + &rq_meta, + &lengths0, + 0, + DistanceType::L2, + ) + .await + .unwrap(); + write_rq_partial_aux( + &object_store, + &aux1, + &rq_meta, + &lengths1, + 1_000, + DistanceType::L2, + ) + .await + .unwrap(); + + merge_partial_vector_auxiliary_files( + &object_store, + &[aux0.clone(), aux1.clone()], + &index_dir, + crate::progress::noop_progress(), + ) + .await + .unwrap(); + + let aux_out = index_dir.clone().join(INDEX_AUXILIARY_FILE_NAME); + let sched = ScanScheduler::new( + Arc::new(object_store.clone()), + SchedulerConfig::max_bandwidth(&object_store), + ); + let fh = sched + .open_file(&aux_out, &CachedFileSize::unknown()) + .await + .unwrap(); + let reader = V2Reader::try_open( + fh, + None, + Arc::default(), + &lance_core::cache::LanceCache::no_cache(), + V2ReaderOptions::default(), + ) + .await + .unwrap(); + let meta = reader.metadata(); + let rq_meta_json = meta.file_schema.metadata.get(RABIT_METADATA_KEY).unwrap(); + let merged_rq_meta: RabitQuantizationMetadata = serde_json::from_str(rq_meta_json).unwrap(); + assert_eq!(merged_rq_meta.num_bits, 4); + assert!(merged_rq_meta.packed); + + let mut total_rows = 0usize; + let mut checked_split_columns = false; + let mut stream = 
reader + .read_stream( + lance_io::ReadBatchParams::RangeFull, + u32::MAX, + 4, + lance_encoding::decoder::FilterExpression::no_filter(), + ) + .await + .unwrap(); + while let Some(batch) = stream.next().await { + let batch = batch.unwrap(); + if !checked_split_columns { + let schema = batch.schema(); + let ex_code_field = schema.field_with_name(RABIT_EX_CODE_COLUMN).unwrap(); + let DataType::FixedSizeList(_, ex_code_bytes) = ex_code_field.data_type() else { + panic!("RQ ex-code field should be FixedSizeList"); + }; + assert_eq!(*ex_code_bytes, 6); + assert!(schema.field_with_name(EX_ADD_FACTORS_COLUMN).is_ok()); + assert!(schema.field_with_name(EX_SCALE_FACTORS_COLUMN).is_ok()); + checked_split_columns = true; + } + total_rows += batch.num_rows(); + } + assert!(checked_split_columns); + let expected_total: usize = lengths0 + .iter() + .zip(lengths1.iter()) + .map(|(a, b)| (*a + *b) as usize) + .sum(); + assert_eq!(total_rows, expected_total); + } + #[tokio::test] async fn test_merge_ivf_pq_codebook_mismatch() { let object_store = ObjectStore::memory(); diff --git a/rust/lance-index/src/vector/ivf.rs b/rust/lance-index/src/vector/ivf.rs index 700c8f193d8..93436ee4dfd 100644 --- a/rust/lance-index/src/vector/ivf.rs +++ b/rust/lance-index/src/vector/ivf.rs @@ -79,13 +79,9 @@ pub fn new_ivf_transformer_with_quantizer( sq, range, )), - Quantizer::Rabit(rq) => Ok(IvfTransformer::with_rq( - centroids, - metric_type, - vector_column, - rq, - range, - )), + Quantizer::Rabit(rq) => { + IvfTransformer::with_rq(centroids, metric_type, vector_column, rq, range) + } } } @@ -284,7 +280,7 @@ impl IvfTransformer { vector_column: &str, rq: RabitQuantizer, range: Option>, - ) -> Self { + ) -> Result { let mut transforms: Vec> = vec![Arc::new(super::transform::Flatten::new(vector_column))]; @@ -322,9 +318,9 @@ impl IvfTransformer { distance_type, centroids.clone(), vector_column, - ))); + )?)); - Self::new(centroids, distance_type, transforms) + Ok(Self::new(centroids, distance_type, 
transforms)) } #[inline] diff --git a/rust/lance/src/index/vector.rs b/rust/lance/src/index/vector.rs index 04c9d31a0e8..588c96ab781 100644 --- a/rust/lance/src/index/vector.rs +++ b/rust/lance/src/index/vector.rs @@ -29,7 +29,7 @@ use lance_index::metrics::NoOpMetricsCollector; use lance_index::optimize::OptimizeOptions; use lance_index::progress::{IndexBuildProgress, noop_progress}; use lance_index::vector::bq::builder::RabitQuantizer; -use lance_index::vector::bq::{RQBuildParams, RQRotationType}; +use lance_index::vector::bq::{RQBuildParams, RQRotationType, validate_supported_rq_num_bits}; use lance_index::vector::flat::index::{FlatBinQuantizer, FlatIndex, FlatQuantizer}; use lance_index::vector::hnsw::HNSW; use lance_index::vector::ivf::builder::recommended_num_partitions; @@ -543,8 +543,21 @@ async fn prepare_vector_segment_build( ))); } - let num_rows = dataset.count_rows(None).await?; let index_type = params.index_type(); + if index_type == IndexType::IvfRq { + let Some(StageParams::RQ(rq_params)) = stages.last() else { + return Err(Error::index(format!( + "{mode}: invalid stages: {:?}", + stages + ))); + }; + // Multi-bit RQ quantization/storage internals are kept available for + // split-code preparation, but public index creation stays binary-only + // until multi-bit search support lands. 
+ validate_supported_rq_num_bits(rq_params.num_bits)?; + } + + let num_rows = dataset.count_rows(None).await?; let num_partitions = ivf_params0.num_partitions.unwrap_or_else(|| { recommended_num_partitions( num_rows, diff --git a/rust/lance/src/index/vector/ivf/v2.rs b/rust/lance/src/index/vector/ivf/v2.rs index 672264f0dac..b47b00d409c 100644 --- a/rust/lance/src/index/vector/ivf/v2.rs +++ b/rust/lance/src/index/vector/ivf/v2.rs @@ -550,6 +550,22 @@ impl DeepSizeOf for IVFIndex { } impl IVFIndex { + fn ensure_search_supported(&self) -> Result<()> { + if Q::quantization_type() == QuantizationType::Rabit { + let metadata = serde_json::to_value(self.storage.metadata())?; + let num_bits = metadata + .get("num_bits") + .and_then(|value| value.as_u64()) + .unwrap_or(1); + if num_bits > 1 { + return Err(Error::not_supported( + "IVF_RQ num_bits>1 search is not supported until split-code query support is implemented", + )); + } + } + Ok(()) + } + async fn prepare_partition( &self, partition_id: usize, @@ -1205,6 +1221,7 @@ impl VectorIndex for IVFInd pre_filter: Arc, metrics: &dyn MetricsCollector, ) -> Result { + self.ensure_search_supported()?; let part_entry = self.load_partition(partition_id, true, metrics).await?; pre_filter.wait_for_ready().await?; @@ -1257,6 +1274,7 @@ impl VectorIndex for IVFInd pre_filter: Arc, metrics: &dyn MetricsCollector, ) -> Result { + self.ensure_search_supported()?; Ok(Box::new( self.prepare_partition(partition_id, query, pre_filter, metrics) .await?, @@ -1268,6 +1286,7 @@ impl VectorIndex for IVFInd prepared: PreparedPartitionSearchHandle, metrics: &dyn MetricsCollector, ) -> Result { + self.ensure_search_supported()?; let prepared = prepared .downcast::>() .map_err(|_| Error::internal("failed to downcast prepared partition search"))?; @@ -1306,6 +1325,7 @@ impl VectorIndex for IVFInd control: Option>, metrics: Arc, ) -> Result { + self.ensure_search_supported()?; if partitions.len() != q_c_dists.len() { return 
Err(Error::invalid_input(format!( "partition count {} does not match centroid distance count {}", @@ -1675,7 +1695,9 @@ mod tests { use itertools::Itertools; use lance_arrow::FixedSizeListArrayExt; use lance_index::vector::bq::{ - RQBuildParams, RQRotationType, storage::RabitQuantizationMetadata, + RQBuildParams, RQRotationType, + storage::{RABIT_EX_CODE_COLUMN, RabitQuantizationMetadata}, + transform::EX_SCALE_FACTORS_COLUMN, }; use lance_index::vector::storage::VectorStore; @@ -1694,7 +1716,7 @@ mod tests { }; use lance_core::cache::LanceCache; use lance_core::utils::tempfile::TempStrDir; - use lance_core::{ROW_ID, Result}; + use lance_core::{Error, ROW_ID, Result}; use lance_encoding::decoder::DecoderPlugins; use lance_file::reader::{FileReader, FileReaderOptions}; use lance_file::writer::FileWriter; @@ -1790,11 +1812,11 @@ mod tests { vectors } - async fn get_rq_metadata( + async fn open_rq_aux_reader( dataset: &Dataset, scheduler: Arc, index_uuid: &str, - ) -> RabitQuantizationMetadata { + ) -> FileReader { let index_path = dataset .indices_dir() .join(index_uuid) @@ -1803,7 +1825,7 @@ mod tests { .open_file(&index_path, &CachedFileSize::unknown()) .await .unwrap(); - let reader = FileReader::try_open( + FileReader::try_open( file_scheduler, None, Arc::::default(), @@ -1811,7 +1833,15 @@ mod tests { FileReaderOptions::default(), ) .await - .unwrap(); + .unwrap() + } + + async fn get_rq_metadata( + dataset: &Dataset, + scheduler: Arc, + index_uuid: &str, + ) -> RabitQuantizationMetadata { + let reader = open_rq_aux_reader(dataset, scheduler, index_uuid).await; let metadata = reader.schema().metadata.get(STORAGE_METADATA_KEY).unwrap(); let metadata_entries: Vec = serde_json::from_str(metadata).unwrap(); serde_json::from_str(&metadata_entries[0]).unwrap() @@ -3966,6 +3996,54 @@ mod tests { test_remap(params.clone(), nlist, recall_requirement).await; } + #[tokio::test] + #[ignore = "IVF_RQ num_bits>1 creation is gated until split-code search support is 
implemented"] + async fn test_build_ivf_rq_multi_bit_persists_split_codes_and_gates_search() { + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + let (mut dataset, vectors) = generate_test_dataset::(test_uri, 0.0..1.0).await; + + let ivf_params = IvfBuildParams::new(4); + let rq_params = RQBuildParams::with_rotation_type(9, RQRotationType::Fast); + let params = VectorIndexParams::with_ivf_rq_params(DistanceType::L2, ivf_params, rq_params); + dataset + .create_index(&["vector"], IndexType::Vector, None, &params, true) + .await + .unwrap(); + + let indices = dataset.load_indices().await.unwrap(); + assert_eq!(indices.len(), 1); + let obj_store = Arc::new(ObjectStore::local()); + let scheduler = ScanScheduler::new(obj_store, SchedulerConfig::default_for_testing()); + let index_uuid = indices[0].uuid.to_string(); + let rq_meta = get_rq_metadata(&dataset, scheduler.clone(), &index_uuid).await; + assert_eq!(rq_meta.num_bits, 9); + + let reader = open_rq_aux_reader(&dataset, scheduler, &index_uuid).await; + let schema = reader.schema(); + let ex_field = schema.field(RABIT_EX_CODE_COLUMN).unwrap(); + let DataType::FixedSizeList(_, ex_code_bytes) = ex_field.data_type() else { + panic!("RQ ex-code field should be FixedSizeList"); + }; + assert_eq!(ex_code_bytes, 32); + assert!(schema.field(EX_SCALE_FACTORS_COLUMN).is_some()); + + let query = vectors.value(0); + let err = dataset + .scan() + .nearest("vector", query.as_primitive::(), 10) + .unwrap() + .try_into_batch() + .await + .unwrap_err(); + assert!(matches!(err, Error::Execution { .. }), "{err}"); + assert!( + err.to_string() + .contains("num_bits>1 search is not supported"), + "{err}" + ); + } + #[rstest] #[case::fast(RQRotationType::Fast)] #[case::matrix(RQRotationType::Matrix)] From fe875b70f5419967cc6e9765d25b3ca0d099cd13 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Fri, 5 Jun 2026 16:21:02 +0800 Subject: [PATCH 031/177] fix: compile AVX-512 dist table for target CPU (#7121) Fixes #7098.
`dist_table.c` was compiled with `-march=native`, so non-AVX512 build hosts could skip the AVX512 object while other AVX512 kernel cfgs still enabled Rust references. This compiles the dist-table kernel for an explicit AVX512 target and splits the build cfgs per generated kernel family, so Rust only references C symbols that were actually built. The dist-table AVX512 dispatch now also requires `avx512bw` before using BW intrinsics. --- rust/lance-linalg/build.rs | 16 ++++++++------- rust/lance-linalg/src/distance/cosine.rs | 8 ++++---- rust/lance-linalg/src/distance/dot.rs | 8 ++++---- rust/lance-linalg/src/distance/l2.rs | 8 ++++---- rust/lance-linalg/src/distance/norm_l2.rs | 8 ++++---- rust/lance-linalg/src/simd/dist_table.rs | 24 +++++++++++++---------- 6 files changed, 39 insertions(+), 33 deletions(-) diff --git a/rust/lance-linalg/build.rs b/rust/lance-linalg/build.rs index 06e1439c77a..407f2a589ea 100644 --- a/rust/lance-linalg/build.rs +++ b/rust/lance-linalg/build.rs @@ -16,7 +16,9 @@ fn main() -> Result<(), String> { } // Let clippy know about our custom cfg attribute - println!("cargo::rustc-check-cfg=cfg(kernel_support, values(\"avx512\"))"); + println!( + "cargo::rustc-check-cfg=cfg(kernel_support, values(\"avx512_f16\", \"avx512_bf16\", \"avx512_dist_table\"))" + ); println!("cargo:rerun-if-changed=src/simd/f16.c"); println!("cargo:rerun-if-changed=src/simd/bf16.c"); @@ -58,10 +60,10 @@ fn main() -> Result<(), String> { "cargo:warning=Skipping build of AVX-512 fp16 kernels. Error: {}", err ); - } else { + } else if cfg!(feature = "fp16kernels") { // We create a special cfg so that we can detect we have in fact // generated the AVX512 version of the f16 kernels. 
- println!("cargo:rustc-cfg=kernel_support=\"avx512\""); + println!("cargo:rustc-cfg=kernel_support=\"avx512_f16\""); }; // Build AVX-512 bf16 kernels (sapphirerapids has native vdpbf16ps) if let Err(err) = @@ -71,16 +73,16 @@ fn main() -> Result<(), String> { "cargo:warning=Skipping build of AVX-512 bf16 kernels. Error: {}", err ); - } else { - println!("cargo:rustc-cfg=kernel_support=\"avx512\""); + } else if cfg!(feature = "fp16kernels") { + println!("cargo:rustc-cfg=kernel_support=\"avx512_bf16\""); }; - if let Err(err) = build_dist_table_with_flags("avx512", &["-march=native"]) { + if let Err(err) = build_dist_table_with_flags("avx512", &["-march=sapphirerapids"]) { println!( "cargo:warning=Skipping build of AVX-512 dist_table. Error: {}", err ); } else { - println!("cargo:rustc-cfg=kernel_support=\"avx512\""); + println!("cargo:rustc-cfg=kernel_support=\"avx512_dist_table\""); }; // Build a version with AVX // While GCC doesn't have support for _Float16 until GCC 12, clang diff --git a/rust/lance-linalg/src/distance/cosine.rs b/rust/lance-linalg/src/distance/cosine.rs index be5bf436344..995191b77eb 100644 --- a/rust/lance-linalg/src/distance/cosine.rs +++ b/rust/lance-linalg/src/distance/cosine.rs @@ -82,7 +82,7 @@ mod bf16_kernel { #[cfg(target_arch = "aarch64")] pub fn cosine_bf16_neon(x: *const bf16, x_norm: f32, y: *const bf16, dimension: u32) -> f32; - #[cfg(all(kernel_support = "avx512", target_arch = "x86_64"))] + #[cfg(all(kernel_support = "avx512_bf16", target_arch = "x86_64"))] pub fn cosine_bf16_avx512( x: *const bf16, x_norm: f32, @@ -109,7 +109,7 @@ impl Cosine for bf16 { }, #[cfg(all( feature = "fp16kernels", - kernel_support = "avx512", + kernel_support = "avx512_bf16", target_arch = "x86_64" ))] SimdSupport::Avx512FP16 => unsafe { @@ -141,7 +141,7 @@ mod kernel { unsafe extern "C" { #[cfg(target_arch = "aarch64")] pub fn cosine_f16_neon(x: *const f16, x_norm: f32, y: *const f16, dimension: u32) -> f32; - #[cfg(all(kernel_support = "avx512", 
target_arch = "x86_64"))] + #[cfg(all(kernel_support = "avx512_f16", target_arch = "x86_64"))] pub fn cosine_f16_avx512(x: *const f16, x_norm: f32, y: *const f16, dimension: u32) -> f32; #[cfg(target_arch = "x86_64")] pub fn cosine_f16_avx2(x: *const f16, x_norm: f32, y: *const f16, dimension: u32) -> f32; @@ -161,7 +161,7 @@ impl Cosine for f16 { }, #[cfg(all( feature = "fp16kernels", - kernel_support = "avx512", + kernel_support = "avx512_f16", target_arch = "x86_64" ))] SimdSupport::Avx512FP16 => unsafe { diff --git a/rust/lance-linalg/src/distance/dot.rs b/rust/lance-linalg/src/distance/dot.rs index cf045b1996a..5903d24e0e5 100644 --- a/rust/lance-linalg/src/distance/dot.rs +++ b/rust/lance-linalg/src/distance/dot.rs @@ -122,7 +122,7 @@ mod bf16_kernel { unsafe extern "C" { #[cfg(target_arch = "aarch64")] pub fn dot_bf16_neon(ptr1: *const bf16, ptr2: *const bf16, len: u32) -> f32; - #[cfg(all(kernel_support = "avx512", target_arch = "x86_64"))] + #[cfg(all(kernel_support = "avx512_bf16", target_arch = "x86_64"))] pub fn dot_bf16_avx512(ptr1: *const bf16, ptr2: *const bf16, len: u32) -> f32; #[cfg(target_arch = "x86_64")] pub fn dot_bf16_avx2(ptr1: *const bf16, ptr2: *const bf16, len: u32) -> f32; @@ -143,7 +143,7 @@ impl Dot for bf16 { }, #[cfg(all( feature = "fp16kernels", - kernel_support = "avx512", + kernel_support = "avx512_bf16", target_arch = "x86_64" ))] SimdSupport::Avx512FP16 => unsafe { @@ -175,7 +175,7 @@ mod kernel { unsafe extern "C" { #[cfg(target_arch = "aarch64")] pub fn dot_f16_neon(ptr1: *const f16, ptr2: *const f16, len: u32) -> f32; - #[cfg(all(kernel_support = "avx512", target_arch = "x86_64"))] + #[cfg(all(kernel_support = "avx512_f16", target_arch = "x86_64"))] pub fn dot_f16_avx512(ptr1: *const f16, ptr2: *const f16, len: u32) -> f32; #[cfg(target_arch = "x86_64")] pub fn dot_f16_avx2(ptr1: *const f16, ptr2: *const f16, len: u32) -> f32; @@ -196,7 +196,7 @@ impl Dot for f16 { }, #[cfg(all( feature = "fp16kernels", - kernel_support = 
"avx512", + kernel_support = "avx512_f16", target_arch = "x86_64" ))] SimdSupport::Avx512FP16 => unsafe { diff --git a/rust/lance-linalg/src/distance/l2.rs b/rust/lance-linalg/src/distance/l2.rs index 036b54de8d1..c830d103df4 100644 --- a/rust/lance-linalg/src/distance/l2.rs +++ b/rust/lance-linalg/src/distance/l2.rs @@ -152,7 +152,7 @@ mod bf16_kernel { unsafe extern "C" { #[cfg(target_arch = "aarch64")] pub fn l2_bf16_neon(ptr1: *const bf16, ptr2: *const bf16, len: u32) -> f32; - #[cfg(all(kernel_support = "avx512", target_arch = "x86_64"))] + #[cfg(all(kernel_support = "avx512_bf16", target_arch = "x86_64"))] pub fn l2_bf16_avx512(ptr1: *const bf16, ptr2: *const bf16, len: u32) -> f32; #[cfg(target_arch = "x86_64")] pub fn l2_bf16_avx2(ptr1: *const bf16, ptr2: *const bf16, len: u32) -> f32; @@ -173,7 +173,7 @@ impl L2 for bf16 { }, #[cfg(all( feature = "fp16kernels", - kernel_support = "avx512", + kernel_support = "avx512_bf16", target_arch = "x86_64" ))] SimdSupport::Avx512FP16 => unsafe { @@ -205,7 +205,7 @@ mod kernel { unsafe extern "C" { #[cfg(target_arch = "aarch64")] pub fn l2_f16_neon(ptr1: *const f16, ptr2: *const f16, len: u32) -> f32; - #[cfg(all(kernel_support = "avx512", target_arch = "x86_64"))] + #[cfg(all(kernel_support = "avx512_f16", target_arch = "x86_64"))] pub fn l2_f16_avx512(ptr1: *const f16, ptr2: *const f16, len: u32) -> f32; #[cfg(target_arch = "x86_64")] pub fn l2_f16_avx2(ptr1: *const f16, ptr2: *const f16, len: u32) -> f32; @@ -226,7 +226,7 @@ impl L2 for f16 { }, #[cfg(all( feature = "fp16kernels", - kernel_support = "avx512", + kernel_support = "avx512_f16", target_arch = "x86_64" ))] SimdSupport::Avx512FP16 => unsafe { diff --git a/rust/lance-linalg/src/distance/norm_l2.rs b/rust/lance-linalg/src/distance/norm_l2.rs index cd604ac8c05..b1daf85ab3b 100644 --- a/rust/lance-linalg/src/distance/norm_l2.rs +++ b/rust/lance-linalg/src/distance/norm_l2.rs @@ -29,7 +29,7 @@ mod kernel { unsafe extern "C" { #[cfg(target_arch = "aarch64")] 
pub fn norm_l2_f16_neon(ptr: *const f16, len: u32) -> f32; - #[cfg(all(kernel_support = "avx512", target_arch = "x86_64"))] + #[cfg(all(kernel_support = "avx512_f16", target_arch = "x86_64"))] pub fn norm_l2_f16_avx512(ptr: *const f16, len: u32) -> f32; #[cfg(target_arch = "x86_64")] pub fn norm_l2_f16_avx2(ptr: *const f16, len: u32) -> f32; @@ -57,7 +57,7 @@ impl Normalize for f16 { }, #[cfg(all( feature = "fp16kernels", - kernel_support = "avx512", + kernel_support = "avx512_f16", target_arch = "x86_64" ))] SimdSupport::Avx512FP16 => unsafe { @@ -87,7 +87,7 @@ mod bf16_kernel { unsafe extern "C" { #[cfg(target_arch = "aarch64")] pub fn norm_l2_bf16_neon(ptr: *const bf16, len: u32) -> f32; - #[cfg(all(kernel_support = "avx512", target_arch = "x86_64"))] + #[cfg(all(kernel_support = "avx512_bf16", target_arch = "x86_64"))] pub fn norm_l2_bf16_avx512(ptr: *const bf16, len: u32) -> f32; #[cfg(target_arch = "x86_64")] pub fn norm_l2_bf16_avx2(ptr: *const bf16, len: u32) -> f32; @@ -108,7 +108,7 @@ impl Normalize for bf16 { }, #[cfg(all( feature = "fp16kernels", - kernel_support = "avx512", + kernel_support = "avx512_bf16", target_arch = "x86_64" ))] SimdSupport::Avx512FP16 => unsafe { diff --git a/rust/lance-linalg/src/simd/dist_table.rs b/rust/lance-linalg/src/simd/dist_table.rs index 66c30c75427..bfc05fc2f26 100644 --- a/rust/lance-linalg/src/simd/dist_table.rs +++ b/rust/lance-linalg/src/simd/dist_table.rs @@ -37,18 +37,22 @@ pub fn sum_4bit_dist_table( debug_assert!(n.is_multiple_of(BATCH_SIZE)); match *SIMD_SUPPORT { - #[cfg(all(kernel_support = "avx512", target_arch = "x86_64"))] - SimdSupport::Avx512 | SimdSupport::Avx512FP16 => unsafe { + #[cfg(all(kernel_support = "avx512_dist_table", target_arch = "x86_64"))] + SimdSupport::Avx512 | SimdSupport::Avx512FP16 + if std::arch::is_x86_feature_detected!("avx512bw") => + { for i in (0..n).step_by(BATCH_SIZE) { let codes = &codes[i * code_len..(i + BATCH_SIZE) * code_len]; - sum_4bit_dist_table_32bytes_batch_avx512( 
- codes.as_ptr(), - codes.len(), - dist_table.as_ptr(), - dists[i..i + BATCH_SIZE].as_mut_ptr(), - ) + unsafe { + sum_4bit_dist_table_32bytes_batch_avx512( + codes.as_ptr(), + codes.len(), + dist_table.as_ptr(), + dists[i..i + BATCH_SIZE].as_mut_ptr(), + ) + } } - }, + } #[cfg(target_arch = "x86_64")] SimdSupport::Avx2 => unsafe { for i in (0..n).step_by(BATCH_SIZE) { @@ -253,7 +257,7 @@ unsafe fn sum_dist_table_32bytes_batch_neon(codes: &[u8], dist_table: &[u8], dis // We implement the AVX512 version in C because AVX512 is not stable yet in Rust, // implement it in Rust once we upgrade rust to 1.89.0. unsafe extern "C" { - #[cfg(all(kernel_support = "avx512", target_arch = "x86_64"))] + #[cfg(all(kernel_support = "avx512_dist_table", target_arch = "x86_64"))] pub fn sum_4bit_dist_table_32bytes_batch_avx512( codes: *const u8, code_length: usize, From 60d061bfeaf45ac04258a3ab92b680ea364236c5 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Fri, 5 Jun 2026 16:36:52 +0800 Subject: [PATCH 032/177] test(index): cover flat bm25 doc length normalization (#7120) Adds a regression test for OSS-1183 to ensure flat BM25 scoring normalizes by the full document token count, not just the query-matching token count. The test covers two documents with identical term frequency but different lengths, requiring the shorter document to receive the higher score. 
Context: https://github.com/lance-format/lance/issues/7107 --- rust/lance-index/src/scalar/inverted/index.rs | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index a3e91f0f4c7..a6e94cd1a93 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -7051,6 +7051,57 @@ mod tests { ); } + #[tokio::test] + async fn flat_bm25_search_uses_full_document_length_for_normalization() { + let schema = Arc::new(Schema::new(vec![ + ROW_ID_FIELD.clone(), + Field::new("text", DataType::Utf8, false), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(UInt64Array::from(vec![0u64, 1])), + Arc::new(StringArray::from(vec![ + "alpha", + "alpha filler filler filler filler filler filler filler filler filler", + ])), + ], + ) + .unwrap(); + + let input: SendableRecordBatchStream = Box::pin(RecordBatchStreamAdapter::new( + schema.clone(), + stream::iter(vec![Ok(batch)]), + )); + let tokenizer: Box = Box::new(TextTokenizer::new( + TextAnalyzer::builder(SimpleTokenizer::default()).build(), + )); + + let result_stream = flat_bm25_search_stream_with_metrics( + input, + "text".to_string(), + "alpha".to_string(), + tokenizer, + None, + 100, + None, + ) + .await + .unwrap(); + let batches: Vec<_> = result_stream.try_collect().await.unwrap(); + let scored = arrow::compute::concat_batches(&FTS_SCHEMA, &batches).unwrap(); + let row_ids = scored[ROW_ID].as_primitive::(); + let scores = scored[SCORE_COL].as_primitive::(); + + assert_eq!(row_ids.values(), &[0, 1]); + assert!( + scores.value(0) > scores.value(1), + "same term frequency should score shorter document higher; short={}, long={}", + scores.value(0), + scores.value(1) + ); + } + /// An [`IndexReader`] wrapper that hides the posting-group-offsets schema /// metadata key, so a [`PostingListReader`] opened on it takes the /// pre-grouping per-token 
fallback path (issue #7040). From 58bac59268ce2b0434de9c4aaa46e3a07c4cf48f Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Fri, 5 Jun 2026 16:37:08 +0800 Subject: [PATCH 033/177] fix(namespace): tolerate reqwest client model additions (#7124) The no-lock Rust workflow can resolve `lance-namespace-reqwest-client` 0.8.2, whose generated models added optional `branch` fields. This PR makes the namespace implementations initialize those request/response models with defaults so patch-level generated model additions do not break no-lock builds. Context: `build-no-lock` failed on #7120 while compiling `lance-namespace-impls`, independent of that PR's BM25 test-only diff. --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- python/Cargo.lock | 4 ++-- rust/lance-index/src/vector/utils.rs | 5 +++-- rust/lance-namespace-impls/src/dir.rs | 4 +--- rust/lance-namespace-impls/src/rest_adapter.rs | 8 +------- 6 files changed, 10 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b78c27cb2bc..028bfc79cb2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5024,9 +5024,9 @@ dependencies = [ [[package]] name = "lance-namespace-reqwest-client" -version = "0.8.0" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3eefb02ded2c3d4b6b60669bb74822d9fa628e144fc748c79ee31f13f566e87b" +checksum = "7a09733325812e046cb217d548afc4864dedb59545389d45cd498b3d8ecb0d20" dependencies = [ "reqwest 0.12.28", "serde", diff --git a/Cargo.toml b/Cargo.toml index fb49e862f58..3140866590b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -70,7 +70,7 @@ lance-linalg = { version = "=8.0.0-beta.3", path = "./rust/lance-linalg" } lance-namespace = { version = "=8.0.0-beta.3", path = "./rust/lance-namespace" } lance-namespace-impls = { version = "=8.0.0-beta.3", path = "./rust/lance-namespace-impls" } lance-namespace-datafusion = { version = "=7.0.0-beta.9", path = "./rust/lance-namespace-datafusion" } -lance-namespace-reqwest-client = "0.8.0" 
+lance-namespace-reqwest-client = "0.8.2" lance-select = { version = "=8.0.0-beta.3", path = "./rust/lance-select" } lance-tokenizer = { version = "=8.0.0-beta.3", path = "./rust/lance-tokenizer" } lance-table = { version = "=8.0.0-beta.3", path = "./rust/lance-table" } diff --git a/python/Cargo.lock b/python/Cargo.lock index 47bd677dbfa..9a900dbd26b 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -4549,9 +4549,9 @@ dependencies = [ [[package]] name = "lance-namespace-reqwest-client" -version = "0.8.0" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3eefb02ded2c3d4b6b60669bb74822d9fa628e144fc748c79ee31f13f566e87b" +checksum = "7a09733325812e046cb217d548afc4864dedb59545389d45cd498b3d8ecb0d20" dependencies = [ "reqwest 0.12.28", "serde", diff --git a/rust/lance-index/src/vector/utils.rs b/rust/lance-index/src/vector/utils.rs index 8307bd9edff..fb4f9004c57 100644 --- a/rust/lance-index/src/vector/utils.rs +++ b/rust/lance-index/src/vector/utils.rs @@ -305,12 +305,13 @@ mod tests { )) as ArrayRef, 42.0f32)] #[case::f32(Arc::new(Float32Array::from( (0..100).flat_map(|i| std::iter::repeat_n(i as f32, 16)).collect::>(), - )) as ArrayRef, 42.1f32)] + )) as ArrayRef, 42.0f32)] fn test_simple_index_nearest_centroid(#[case] centroids: ArrayRef, #[case] query_val: f32) { let index = build_index(centroids, 16); let query: ArrayRef = Arc::new(Float32Array::from(vec![query_val; 16])); - let (id, _) = index.search(query).unwrap(); + let (id, dist) = index.search(query).unwrap(); assert_eq!(id, 42); + assert_eq!(dist, 0.0); } #[test] diff --git a/rust/lance-namespace-impls/src/dir.rs b/rust/lance-namespace-impls/src/dir.rs index fb0b03ad239..a8eb1416183 100644 --- a/rust/lance-namespace-impls/src/dir.rs +++ b/rust/lance-namespace-impls/src/dir.rs @@ -4168,9 +4168,7 @@ impl LanceNamespace for DirectoryNamespace { .await .map_err(|e| Self::map_tag_error(e, &request.tag, &table_uri))?; - Ok(GetTableTagVersionResponse { - 
version: version as i64, - }) + Ok(GetTableTagVersionResponse::new(version as i64)) } async fn create_table_tag( diff --git a/rust/lance-namespace-impls/src/rest_adapter.rs b/rust/lance-namespace-impls/src/rest_adapter.rs index f7de5dc2240..6a3875ebf29 100644 --- a/rust/lance-namespace-impls/src/rest_adapter.rs +++ b/rust/lance-namespace-impls/src/rest_adapter.rs @@ -3095,13 +3095,7 @@ mod tests { "context_test_ns".to_string(), "test_table".to_string(), ]), - with_table_uri: None, - load_detailed_metadata: None, - check_declared: None, - vend_credentials: None, - version: None, - identity: None, - context: None, + ..Default::default() }; let result = namespace.describe_table(describe_req).await; assert!(result.is_ok(), "Failed to describe table: {:?}", result); From c7ae55a6be95ffc2a85711de7f54eed80a0eab05 Mon Sep 17 00:00:00 2001 From: YueZhang <69956021+zhangyue19921010@users.noreply.github.com> Date: Fri, 5 Jun 2026 17:09:26 +0800 Subject: [PATCH 034/177] feat: add segmented BTree index merge_segments support (#6889) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes: https://github.com/lance-format/lance/issues/6979 Also refactored scalar index optimize logic in `append.rs`: 1. **Unified `merge_scalar_indices`** — consolidates all scalar-index merge logic from `merge_indices_with_unindexed_frags` into one function with explicit segment selection and a single capability-based decision tree. 2. **Added `BTreeIndex::merge_segments`** — a k-way segment merge primitive, enabling multi-segment consolidation without re-reading the dataset. 3. **`num_indices_to_merge` is now honored uniformly for all scalar types** — previously scalar indices ignored it and always merged into one segment. Now `append` (=`0`) builds a delta only for the unindexed fragments, `merge_N` merges the latest N segments, and `default`/`merge_1` merges the newest delta — matching the inverted/vector semantics. 4. 
**Multi-segment handling is capability-based, never a hard error** — `merge_scalar_indices` picks per case: 1. **Append delta** (no segment selected) → build a new segment over the unindexed fragments, keep the old ones. 2. **Supports N:1 merge (like BTree), ≥1 selected** → k-way merge into one segment. 3. **No N:1 merge, exactly 1 selected** → single-segment `update()`. 4. **No N:1 merge, ≥2 selected** → rebuild a fresh segment over the covered fragments (no k-way primitive, and lossy summary indexes can't re-emit rows). ### Compatibility Testing #### Context Main branch **ignores** `num_indices_to_merge` for scalar indices — every mode merges the single old segment + new data → 1 segment. A multi-segment scalar index is a state that **cannot exist on main**. #### Test Results (single-segment starting state — the only state main can produce) Behavior is identical across all scalar types here (capability only matters once ≥2 segments exist): | Mode | Main | PR | Diff | |------|------|-----|------| | default / merge_1 / merge_2 | 1 seg (param ignored) | 1 seg (merge newest + new) | ✅ Same result | | **append** | **1 seg (param ignored)** | **2 seg (delta only)** | **⚠️ Changed** | > The only behavior change is `append` mode: it now produces a delta segment instead of silently merging. All other modes are unchanged from main. #### PR-only new behavior: multi-segment scenarios (unreachable on main) After accumulating N segments via repeated append-mode optimizes (applies to all scalar types): | Starting state | Mode | Result | |----------------|------|--------| | 3 segments + new data | default / merge_1 | 3−1+1 = **3** segments (merge newest delta) | | 3 segments + new data | merge_2 | 3−2+1 = **2** segments | | 3 segments + new data | append | 3+1 = **4** segments | > For `merge_2` (≥2 selected): types that **support N:1 merge** (like BTree) use the k-way merge primitive; types that **don't** consolidate via a full rebuild over the covered fragments. 
Both reach the same segment count. **Query correctness**: All combinations verified — no data loss or incorrect results. Added tests for the delta-append, single-segment `update`, and multi-segment rebuild paths. --------- Co-authored-by: Claude Opus 4.8 (1M context) --- java/src/test/java/org/lance/DatasetTest.java | 24 +- rust/lance-index/src/scalar/btree.rs | 104 ++- rust/lance/src/dataset.rs | 3 +- rust/lance/src/index.rs | 21 +- rust/lance/src/index/append.rs | 689 +++++++++++++++--- rust/lance/src/index/create.rs | 248 +++++-- rust/lance/src/index/scalar.rs | 1 + rust/lance/src/index/scalar/btree.rs | 193 +++++ 8 files changed, 1085 insertions(+), 198 deletions(-) create mode 100644 rust/lance/src/index/scalar/btree.rs diff --git a/java/src/test/java/org/lance/DatasetTest.java b/java/src/test/java/org/lance/DatasetTest.java index 3ea6a0812e1..45466a0367c 100644 --- a/java/src/test/java/org/lance/DatasetTest.java +++ b/java/src/test/java/org/lance/DatasetTest.java @@ -1993,18 +1993,20 @@ void testOptimizingIndices(@TempDir Path tempDir) throws Exception { OptimizeOptions options = OptimizeOptions.builder().numIndicesToMerge(0).build(); dsAppended.optimizeIndices(options); - List afterIndexes = dsAppended.getIndexes(); - Index idIndexAfter = - afterIndexes.stream() + List idIndexes = + dsAppended.getIndexes().stream() .filter(idx -> "id_idx".equals(idx.name())) - .findFirst() - .orElse(null); - assertNotNull(idIndexAfter); - List afterFragments = idIndexAfter.fragments().orElse(Collections.emptyList()); - - assertTrue(afterFragments.contains(0)); - assertTrue(afterFragments.contains(1)); - assertEquals(2, afterFragments.size()); + .collect(Collectors.toList()); + assertEquals( + 2, + idIndexes.size(), + "append-only optimize must add a delta segment instead of merging"); + + Set coveredFragments = + idIndexes.stream() + .flatMap(idx -> idx.fragments().orElse(Collections.emptyList()).stream()) + .collect(Collectors.toSet()); + assertEquals(new 
HashSet<>(Arrays.asList(0, 1)), coveredFragments); } } } diff --git a/rust/lance-index/src/scalar/btree.rs b/rust/lance-index/src/scalar/btree.rs index ba6d3dd142d..a650e8db244 100644 --- a/rust/lance-index/src/scalar/btree.rs +++ b/rust/lance-index/src/scalar/btree.rs @@ -68,7 +68,7 @@ use tracing::{info, instrument}; mod flat; -const BTREE_LOOKUP_NAME: &str = "page_lookup.lance"; +pub const BTREE_LOOKUP_NAME: &str = "page_lookup.lance"; const BTREE_PAGES_NAME: &str = "page_data.lance"; pub const DEFAULT_BTREE_BATCH_SIZE: u64 = 4096; const BATCH_SIZE_META_KEY: &str = "batch_size"; @@ -1489,7 +1489,7 @@ impl BTreeIndex { } /// Create a stream of all the data in the index, in the same format used to train the index - async fn into_data_stream(self) -> Result { + async fn data_stream(&self) -> Result { let lazy_reader = LazyIndexReader::new(self.store.clone(), self.ranges_to_files.clone()); let reader = lazy_reader.get().await?; let new_schema = Arc::new(self.train_schema()); @@ -1512,25 +1512,51 @@ impl BTreeIndex { ))) } - async fn combine_old_new( - self, + /// Merge N source BTree segments plus an additional `new_data` stream into + /// a single BTree under `dest_store`, without re-reading the dataset. 
+ pub async fn merge_segments( + segments: &[Arc], new_data: SendableRecordBatchStream, - chunk_size: u64, + dest_store: &dyn IndexStore, old_data_filter: Option, - ) -> Result { - let value_column_index = new_data.schema().index_of(VALUE_COLUMN_NAME)?; - - let new_input = Arc::new(OneShotExec::new(new_data)); - let old_stream = self.into_data_stream().await?; - let old_stream = match old_data_filter { - Some(filter) => filter_row_ids(old_stream, filter), - None => old_stream, + ) -> Result { + let Some(first) = segments.first() else { + return Err(Error::invalid_input( + "cannot merge BTree index without at least one source segment".to_string(), + )); }; - let old_input = Arc::new(OneShotExec::new(old_stream)); - debug_assert_eq!( - old_input.schema().flattened_fields().len(), - new_input.schema().flattened_fields().len() - ); + + for segment in segments.iter().skip(1) { + if segment.data_type != first.data_type { + return Err(Error::index(format!( + "cannot merge BTree segments with different value types ({:?} vs {:?})", + first.data_type, segment.data_type + ))); + } + } + + let new_schema = new_data.schema(); + let value_column_index = new_schema.index_of(VALUE_COLUMN_NAME)?; + let new_value_type = new_schema.field(value_column_index).data_type(); + if new_value_type != &first.data_type { + return Err(Error::invalid_input(format!( + "BTree merge: new_data value column type {:?} does not match \ + segment value type {:?}", + new_value_type, first.data_type + ))); + } + + let mut inputs: Vec> = Vec::with_capacity(segments.len() + 1); + for segment in segments { + let stream = segment.data_stream().await?; + let stream = match old_data_filter.clone() { + Some(filter) => filter_row_ids(stream, filter), + None => stream, + }; + let exec = Arc::new(OneShotExec::new(stream)); + inputs.push(exec); + } + inputs.push(Arc::new(OneShotExec::new(new_data))); let sort_expr = PhysicalSortExpr { expr: Arc::new(Column::new(VALUE_COLUMN_NAME, value_column_index)), @@ -1539,11 
+1565,10 @@ impl BTreeIndex { nulls_first: true, }, }; - // The UnionExec creates multiple partitions but the SortPreservingMergeExec merges - // them back into a single partition. - let all_data = UnionExec::try_new(vec![old_input, new_input])?; - let ordered = Arc::new(SortPreservingMergeExec::new([sort_expr].into(), all_data)); - + // UnionExec yields multiple partitions; SortPreservingMergeExec merges + // them back into a single partition while preserving value-ordering. + let unioned = UnionExec::try_new(inputs)?; + let ordered = Arc::new(SortPreservingMergeExec::new([sort_expr].into(), unioned)); let unchunked = execute_plan( ordered, LanceExecutionOptions { @@ -1551,7 +1576,16 @@ impl BTreeIndex { ..Default::default() }, )?; - Ok(chunk_concat_stream(unchunked, chunk_size as usize)) + let merged_stream = chunk_concat_stream(unchunked, first.batch_size as usize); + + train_btree_index(merged_stream, dest_store, first.batch_size, None, None).await?; + + Ok(CreatedIndex { + index_details: prost_types::Any::from_msg(&pbold::BTreeIndexDetails::default()) + .unwrap(), + index_version: BTREE_INDEX_VERSION, + files: Some(dest_store.list_files_with_sizes().await?), + }) } } @@ -1896,19 +1930,15 @@ impl ScalarIndex for BTreeIndex { dest_store: &dyn IndexStore, old_data_filter: Option, ) -> Result { - // Merge the existing index data with the new data and then retrain the index on the merged stream - let merged_data_source = self - .clone() - .combine_old_new(new_data, self.batch_size, old_data_filter) - .await?; - train_btree_index(merged_data_source, dest_store, self.batch_size, None, None).await?; - - Ok(CreatedIndex { - index_details: prost_types::Any::from_msg(&pbold::BTreeIndexDetails::default()) - .unwrap(), - index_version: BTREE_INDEX_VERSION, - files: Some(dest_store.list_files_with_sizes().await?), - }) + // Updating is the single-segment case of a segment merge: union this + // index's data with `new_data`, re-sort on value, and retrain. 
+ Self::merge_segments( + &[Arc::new(self.clone())], + new_data, + dest_store, + old_data_filter, + ) + .await } fn update_criteria(&self) -> UpdateCriteria { diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index fdb18398e6e..e3f7a197e58 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -3054,7 +3054,8 @@ impl Dataset { IndexType::BTree => { Err(Error::invalid_input( "BTree distributed indexing no longer supports merge_index_metadata; \ - build segments, and commit with commit_existing_index_segments(...)" + build segments, optionally merge groups with merge_existing_index_segments(...), \ + and commit with commit_existing_index_segments(...)" .to_string(), )) } diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 81b0ddee407..3a5f9975810 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -47,6 +47,7 @@ use lance_index::{INDEX_FILE_NAME, Index, IndexType, PrewarmOptions, pb, vector: use lance_index::{ IndexCriteria, is_system_index, metrics::{MetricsCollector, NoOpMetricsCollector}, + scalar::btree::BTREE_LOOKUP_NAME, }; use lance_io::scheduler::{ScanScheduler, SchedulerConfig}; use lance_io::traits::Reader; @@ -257,6 +258,19 @@ fn segment_has_bitmap_details(segment: &IndexMetadata) -> bool { .is_some_and(|details| details.type_url.ends_with("BitmapIndexDetails")) } +/// Detect BTree segments, preserving a legacy pre-details fallback. 
+fn segment_has_btree_details(segment: &IndexMetadata) -> bool { + segment.index_details.as_ref().map_or_else( + || { + segment + .files + .as_ref() + .is_some_and(|files| files.iter().any(|file| file.path == BTREE_LOOKUP_NAME)) + }, + |details| details.type_url.ends_with("BTreeIndexDetails"), + ) +} + // Cache keys for different index types #[derive(Debug, Clone)] pub(crate) struct LegacyVectorIndexCacheKey<'a> { @@ -1112,7 +1126,8 @@ impl DatasetIndexExt for Dataset { let all_vector = source_segments.iter().all(segment_has_vector_details); let all_inverted = source_segments.iter().all(segment_has_inverted_details); let all_bitmap = source_segments.iter().all(segment_has_bitmap_details); - if !all_vector && !all_inverted && !all_bitmap { + let all_btree = source_segments.iter().all(segment_has_btree_details); + if !all_vector && !all_inverted && !all_bitmap && !all_btree { return Err(Error::invalid_input( "merge_existing_index_segments requires all segments to have the same supported index type" .to_string(), @@ -1128,8 +1143,10 @@ impl DatasetIndexExt for Dataset { .await? } else if all_inverted { crate::index::scalar::inverted::merge_segments(self, source_segments).await? - } else { + } else if all_bitmap { crate::index::scalar::bitmap::merge_segments(self, source_segments).await? + } else { + crate::index::scalar::btree::merge_segments(self, source_segments).await? 
}; merged_segment.dataset_version = self.manifest.version; merged_segment.fields = vec![field_id]; diff --git a/rust/lance/src/index/append.rs b/rust/lance/src/index/append.rs index 4398928d3e2..a89b64df276 100644 --- a/rust/lance/src/index/append.rs +++ b/rust/lance/src/index/append.rs @@ -11,7 +11,8 @@ use lance_index::{ optimize::OptimizeOptions, progress::NoopIndexBuildProgress, scalar::{ - CreatedIndex, OldIndexDataFilter, inverted::InvertedIndex, lance_format::LanceIndexStore, + CreatedIndex, OldIndexDataFilter, ScalarIndex, inverted::InvertedIndex, + lance_format::LanceIndexStore, }, }; use lance_select::{RowAddrTreeMap, RowSetOps}; @@ -74,6 +75,203 @@ async fn build_stable_row_id_filter( Ok(::union_all(&row_id_map_refs)) } +/// Build the [`OldIndexDataFilter`] that must be applied to existing index +/// rows when their owning fragments have been pruned by compaction or +/// deletions. +pub async fn build_old_data_filter( + dataset: &Dataset, + effective_old_frags: &RoaringBitmap, + deleted_old_frags: &RoaringBitmap, +) -> Result> { + if dataset.manifest.uses_stable_row_ids() { + let valid_old_row_ids = build_stable_row_id_filter(dataset, effective_old_frags).await?; + Ok(Some(OldIndexDataFilter::RowIds(valid_old_row_ids))) + } else { + Ok(Some(OldIndexDataFilter::Fragments { + to_keep: effective_old_frags.clone(), + to_remove: deleted_old_frags.clone(), + })) + } +} + +async fn load_unindexed_training_data( + dataset: &Dataset, + field_path: &str, + update_criteria: &lance_index::scalar::UpdateCriteria, + unindexed: &[Fragment], +) -> Result { + let fragments = if update_criteria.requires_old_data { + None + } else { + Some(unindexed.to_vec()) + }; + load_training_data( + dataset, + field_path, + &update_criteria.data_criteria, + fragments, + true, + None, + ) + .await +} + +/// Build a fresh, canonical (non-sharded) scalar index over `fragment_ids`, +/// reusing `reference_index`'s params and training criteria. 
+async fn rebuild_scalar_segment( + dataset: &Dataset, + reference_index: &Arc<dyn ScalarIndex>, + field_path: &str, + column_name: &str, + uuid: &str, + fragment_ids: Vec<u32>, +) -> Result<CreatedIndex> { + let params = reference_index.derive_index_params()?; + let update_criteria = reference_index.update_criteria(); + let training_data = load_training_data( + dataset, + field_path, + &update_criteria.data_criteria, + None, + true, + Some(fragment_ids), + ) + .await?; + super::scalar::build_scalar_index( + dataset, + column_name, + uuid, + &params, + true, + None, + Some(training_data), + Arc::new(NoopIndexBuildProgress), + ) + .await +} + +#[allow(clippy::too_many_arguments)] +async fn merge_scalar_indices<'a>( + dataset: Arc<Dataset>, + old_indices: &[&'a IndexMetadata], + unindexed: &[Fragment], + options: &OptimizeOptions, + index_type: IndexType, + field_path: &str, + column_name: &str, + base_unindexed_bitmap: RoaringBitmap, +) -> Result<Option<(Uuid, Vec<&'a IndexMetadata>, RoaringBitmap, CreatedIndex)>> { + if old_indices.is_empty() { + return Err(Error::index( + "merge_scalar_indices: no previous index found".to_string(), + )); + } + + let num_to_merge = options + .num_indices_to_merge + .unwrap_or(1) + .min(old_indices.len()); + + // No new data + ≤1 old selected = rewriting one segment to itself. + if unindexed.is_empty() && num_to_merge <= 1 { + return Ok(None); + } + + let selected_old_indices = &old_indices[old_indices.len() - num_to_merge..]; + + // For the delta case (`selected` empty) the reference is purely + // for reading params; fall back to the last old index then. + let reference_idx = selected_old_indices + .first() + .copied() + .unwrap_or(old_indices[old_indices.len() - 1]); + let reference_index = dataset + .open_scalar_index( + field_path, + &reference_idx.uuid.to_string(), + &NoOpMetricsCollector, + ) + .await?; + + // Effective = bitmap ∩ live fragments; deleted = bitmap \ live fragments.
+ let mut effective_old_frags = RoaringBitmap::new(); + let mut deleted_old_frags = RoaringBitmap::new(); + for idx in selected_old_indices { + if let Some(effective) = idx.effective_fragment_bitmap(&dataset.fragment_bitmap) { + effective_old_frags |= effective; + } + if let Some(deleted) = idx.deleted_fragment_bitmap(&dataset.fragment_bitmap) { + deleted_old_frags |= deleted; + } + } + + let mut frag_bitmap = base_unindexed_bitmap.clone(); + frag_bitmap |= &effective_old_frags; + let new_uuid = Uuid::new_v4(); + + // Scalar index types that expose an N:1 segment-merge primitive reachable + // without rescanning the dataset. + let has_segment_merge_primitive = matches!(index_type, IndexType::BTree); + + // Merge new data into the existing segment(s) instead of rebuilding from + // scratch, when both hold: + // - `effective_old_frags`: the selected segments' coverage intersected + // with live fragments is non-empty, i.e. there is old data worth keeping. + // - `has_segment_merge_primitive` (the index type supports an N:1 segment merge) OR + // `selected_old_indices.len() == 1` (any scalar type can `update` one). + // Otherwise (e.g. ≥2 selected segments of a type without an N:1 merge + // primitive) the index is rebuilt from scratch over `frag_bitmap`. + let can_merge_segments = !effective_old_frags.is_empty() + && (has_segment_merge_primitive || selected_old_indices.len() == 1); + + let created_index = if !can_merge_segments { + rebuild_scalar_segment( + dataset.as_ref(), + &reference_index, + field_path, + column_name, + &new_uuid.to_string(), + frag_bitmap.iter().collect(), + ) + .await?
+ } else { + let update_criteria = reference_index.update_criteria(); + let new_data_stream = + load_unindexed_training_data(dataset.as_ref(), field_path, &update_criteria, unindexed) + .await?; + let new_store = LanceIndexStore::from_dataset_for_new(&dataset, &new_uuid.to_string())?; + let old_data_filter = + build_old_data_filter(dataset.as_ref(), &effective_old_frags, &deleted_old_frags) + .await?; + + match index_type { + IndexType::BTree => { + crate::index::scalar::btree::open_and_merge_segments( + dataset.as_ref(), + field_path, + selected_old_indices, + new_data_stream, + &new_store, + old_data_filter, + ) + .await? + } + _ => { + reference_index + .update(new_data_stream, &new_store, old_data_filter) + .await? + } + } + }; + + Ok(Some(( + new_uuid, + selected_old_indices.to_vec(), + frag_bitmap, + created_index, + ))) +} + async fn metadata_is_vector_index(dataset: &Dataset, index: &IndexMetadata) -> Result { if let Some(files) = &index.files { return Ok(files.iter().any(|file| file.path == INDEX_FILE_NAME)); @@ -339,7 +537,6 @@ pub async fn merge_indices_with_unindexed_frags<'a>( )) } } else { - let mut frag_bitmap = base_unindexed_bitmap.clone(); let mut indices = Vec::with_capacity(old_indices.len()); for idx in old_indices { match dataset @@ -515,105 +712,21 @@ pub async fn merge_indices_with_unindexed_frags<'a>( )) } it if it.is_scalar() => { - let num_to_merge = options - .num_indices_to_merge - .unwrap_or(1) - .min(old_indices.len()); - if unindexed.is_empty() && num_to_merge <= 1 { - return Ok(None); - } - - // Use effective bitmap (intersected with existing dataset fragments) - // to avoid carrying stale data from pruned indices. 
- let effective_old_frags: RoaringBitmap = old_indices - .iter() - .filter_map(|idx| idx.effective_fragment_bitmap(&dataset.fragment_bitmap)) - .fold(RoaringBitmap::new(), |mut acc, b| { - acc |= &b; - acc - }); - let deleted_old_frags: RoaringBitmap = old_indices - .iter() - .filter_map(|idx| idx.deleted_fragment_bitmap(&dataset.fragment_bitmap)) - .fold(RoaringBitmap::new(), |mut acc, b| { - acc |= &b; - acc - }); - frag_bitmap |= &effective_old_frags; - - let index = dataset - .open_scalar_index( - &field_path, - &old_indices[0].uuid.to_string(), - &NoOpMetricsCollector, - ) - .await?; - - let update_criteria = index.update_criteria(); - - let fragments = if update_criteria.requires_old_data { - None - } else { - Some(unindexed.to_vec()) - }; - let new_data_stream = load_training_data( - dataset.as_ref(), + let Some(result) = merge_scalar_indices( + dataset.clone(), + old_indices, + unindexed, + options, + it, &field_path, - &update_criteria.data_criteria, - fragments, - true, - None, + column.name.as_str(), + base_unindexed_bitmap, ) - .await?; - - let new_uuid = Uuid::new_v4(); - - let created_index = if effective_old_frags.is_empty() { - // Old data is fully stale (bitmap pruned to empty). Rebuild - // from scratch instead of merging stale entries. - let params = index.derive_index_params()?; - super::scalar::build_scalar_index( - dataset.as_ref(), - column.name.as_str(), - &new_uuid.to_string(), - ¶ms, - true, - None, - Some(new_data_stream), - Arc::new(NoopIndexBuildProgress), - ) - .await? - } else { - let new_store = - LanceIndexStore::from_dataset_for_new(&dataset, &new_uuid.to_string())?; - let old_data_filter = if dataset.manifest.uses_stable_row_ids() { - // Stable row IDs are opaque IDs, so fragment-bit filtering on - // (row_id >> 32) is invalid. Build an exact allow-list from retained - // fragments' row-id sequences and use precise filtering. 
- let valid_old_row_ids = - build_stable_row_id_filter(dataset.as_ref(), &effective_old_frags) - .await?; - Some(OldIndexDataFilter::RowIds(valid_old_row_ids)) - } else { - // Address-style row IDs encode fragment_id in high 32 bits. - // Fragment bitmap filtering is valid and cheaper in this mode. - Some(OldIndexDataFilter::Fragments { - to_keep: effective_old_frags, - to_remove: deleted_old_frags, - }) - }; - index - .update(new_data_stream, &new_store, old_data_filter) - .await? + .await? + else { + return Ok(None); }; - - // TODO: don't hard-code index version - Ok(( - new_uuid, - vec![old_indices[old_indices.len() - 1]], - frag_bitmap, - created_index, - )) + Ok(result) } _ => Err(Error::index(format!( "Append index: invalid index type: {:?}", @@ -662,7 +775,7 @@ mod tests { use crate::dataset::builder::DatasetBuilder; use crate::dataset::optimize::compact_files; - use crate::dataset::{MergeInsertBuilder, WhenMatched, WhenNotMatched, WriteParams}; + use crate::dataset::{MergeInsertBuilder, WhenMatched, WhenNotMatched, WriteMode, WriteParams}; use crate::index::vector::VectorIndexParams; use crate::utils::test::{DatagenExt, FragmentCount, FragmentRowCount}; @@ -1219,6 +1332,384 @@ mod tests { ); } + #[tokio::test] + async fn test_optimize_btree_multi_segment_optimize_default() { + async fn query_id_count(dataset: &Dataset, id: &str) -> usize { + dataset + .scan() + .filter(&format!("id = '{}'", id)) + .unwrap() + .project(&["id"]) + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows() + } + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Utf8, false)])); + let make_batch = |start: i32, end: i32| { + let ids = StringArray::from_iter_values((start..end).map(|i| format!("song-{i}"))); + RecordBatch::try_new(schema.clone(), vec![Arc::new(ids)]).unwrap() + }; + + // Three fragments of 64 rows each; each commits as its own BTree + // segment so optimize sees 
a multi-segment scalar logical index. + let reader = RecordBatchIterator::new( + vec![ + Ok(make_batch(0, 64)), + Ok(make_batch(64, 128)), + Ok(make_batch(128, 192)), + ], + schema.clone(), + ); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + max_rows_per_file: 64, + ..Default::default() + }), + ) + .await + .unwrap(); + + let params = ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::BTree); + let fragments = dataset.get_fragments(); + assert_eq!(fragments.len(), 3); + + let mut staged_segments = Vec::new(); + for fragment in &fragments { + let segment = crate::index::create::CreateIndexBuilder::new( + &mut dataset, + &["id"], + IndexType::BTree, + &params, + ) + .name("id_idx".into()) + .fragments(vec![fragment.id() as u32]) + .execute_uncommitted() + .await + .unwrap(); + staged_segments.push(segment); + } + dataset + .commit_existing_index_segments("id_idx", "id", staged_segments) + .await + .unwrap(); + assert_eq!( + dataset.load_indices_by_name("id_idx").await.unwrap().len(), + 3 + ); + + let appended = RecordBatchIterator::new(vec![Ok(make_batch(192, 256))], schema.clone()); + let mut dataset = Dataset::write( + appended, + test_uri, + Some(WriteParams { + max_rows_per_file: 64, + mode: WriteMode::Append, + ..Default::default() + }), + ) + .await + .unwrap(); + assert_eq!(dataset.get_fragments().len(), 4); + + dataset + .optimize_indices(&OptimizeOptions::default()) + .await + .unwrap(); + + // Reload from disk to ensure we're reading committed manifest state. + let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + + // Each of these IDs lives in a distinct old segment / fragment. + // song-10 lives in fragment 0, song-80 in fragment 1, song-160 in + // fragment 2, and song-200 in the appended fragment. After optimize + // every row must still be reachable through the logical index, + // regardless of which segment absorbed the new data. 
+ for id in ["song-10", "song-80", "song-160", "song-200"] { + assert_eq!( + query_id_count(&dataset, id).await, + 1, + "expected exactly one row for {id} after multi-segment optimize" + ); + } + + // `OptimizeOptions::default()` (= num_indices_to_merge: None) merges + // the newest segment with the unindexed fragment, like the + // inverted/vector default. The three old segments minus the merged one + // plus the new delta means three segments remain, and together they + // must still cover every dataset fragment without overlap. + let segments_after = dataset.load_indices_by_name("id_idx").await.unwrap(); + assert_eq!( + segments_after.len(), + 3, + "default optimize must merge one delta, not all segments, got {segments_after:?}" + ); + let mut covered = RoaringBitmap::new(); + for segment in &segments_after { + let bitmap = segment + .fragment_bitmap + .as_ref() + .expect("each segment should carry fragment coverage"); + assert!( + covered.is_disjoint(bitmap), + "post-optimize segments must not overlap, got {segments_after:?}" + ); + covered |= bitmap; + } + let mut expected = RoaringBitmap::new(); + for frag in dataset.get_fragments() { + expected.insert(frag.id() as u32); + } + assert_eq!( + covered, expected, + "post-optimize segments should cover every dataset fragment" + ); + } + + #[tokio::test] + async fn test_optimize_btree_optimize_append() { + async fn query_id_count(dataset: &Dataset, id: &str) -> usize { + dataset + .scan() + .filter(&format!("id = '{}'", id)) + .unwrap() + .project(&["id"]) + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows() + } + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Utf8, false)])); + let make_batch = |start: i32, end: i32| { + let ids = StringArray::from_iter_values((start..end).map(|i| format!("song-{i}"))); + RecordBatch::try_new(schema.clone(), vec![Arc::new(ids)]).unwrap() + }; + + // Start with two 
fragments + two committed BTree segments. + let reader = RecordBatchIterator::new( + vec![Ok(make_batch(0, 64)), Ok(make_batch(64, 128))], + schema.clone(), + ); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + max_rows_per_file: 64, + ..Default::default() + }), + ) + .await + .unwrap(); + + let params = ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::BTree); + let original_segment_uuids: Vec<_> = { + let mut staged = Vec::new(); + for fragment in dataset.get_fragments() { + let segment = crate::index::create::CreateIndexBuilder::new( + &mut dataset, + &["id"], + IndexType::BTree, + &params, + ) + .name("id_idx".into()) + .fragments(vec![fragment.id() as u32]) + .execute_uncommitted() + .await + .unwrap(); + staged.push(segment); + } + let uuids = staged.iter().map(|s| s.uuid).collect::<Vec<_>>(); + dataset + .commit_existing_index_segments("id_idx", "id", staged) + .await + .unwrap(); + uuids + }; + assert_eq!(original_segment_uuids.len(), 2); + + // Append a third fragment, leave it unindexed, then run append-mode optimize. + let appended = RecordBatchIterator::new(vec![Ok(make_batch(128, 192))], schema.clone()); + let mut dataset = Dataset::write( + appended, + test_uri, + Some(WriteParams { + max_rows_per_file: 64, + mode: WriteMode::Append, + ..Default::default() + }), + ) + .await + .unwrap(); + + dataset + .optimize_indices(&OptimizeOptions::append()) + .await + .unwrap(); + + // Read fresh from disk to make sure we're inspecting committed state. + let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + + // append() must preserve every original old segment unchanged and add + // exactly one new segment covering only the newly appended fragments. 
+ let committed = dataset.load_indices_by_name("id_idx").await.unwrap(); + let committed_uuids: std::collections::HashSet<_> = + committed.iter().map(|idx| idx.uuid).collect(); + for original in &original_segment_uuids { + assert!( + committed_uuids.contains(original), + "append() must not remove pre-existing segment {original}, \ + but the committed UUIDs are {committed_uuids:?}" + ); + } + assert_eq!( + committed.len(), + original_segment_uuids.len() + 1, + "append() should add exactly one new delta segment, got {committed:?}" + ); + let new_segment = committed + .iter() + .find(|idx| !original_segment_uuids.contains(&idx.uuid)) + .expect("append() must add a new delta segment"); + let new_segment_frags: Vec<_> = new_segment + .fragment_bitmap + .as_ref() + .unwrap() + .iter() + .collect(); + // The appended fragment should be the only one covered by the new delta; + // old segments retain their own coverage. + assert_eq!(new_segment_frags.len(), 1); + + // Sanity check: queries across all fragments still return their rows. + for id in ["song-10", "song-100", "song-160"] { + assert_eq!(query_id_count(&dataset, id).await, 1, "missing row {id}"); + } + } + + #[tokio::test] + async fn test_optimize_bitmap_index_append() { + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + + let schema = Arc::new(Schema::new(vec![Field::new( + "category", + DataType::Utf8, + false, + )])); + let make_batch = |labels: &[&str]| { + let arr = StringArray::from_iter_values(labels.iter().copied()); + RecordBatch::try_new(schema.clone(), vec![Arc::new(arr)]).unwrap() + }; + + // One fragment + one Bitmap segment. 
+ let reader = + RecordBatchIterator::new(vec![Ok(make_batch(&["a", "b", "a", "c"]))], schema.clone()); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + max_rows_per_file: 4, + ..Default::default() + }), + ) + .await + .unwrap(); + + let params = ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::Bitmap); + dataset + .create_index( + &["category"], + IndexType::Bitmap, + Some("cat_idx".into()), + &params, + true, + ) + .await + .unwrap(); + let original_uuid = { + let committed = dataset.load_indices_by_name("cat_idx").await.unwrap(); + assert_eq!(committed.len(), 1); + committed[0].uuid + }; + + // Append a second fragment, leave it unindexed, then optimize with + // `append()` (= num_indices_to_merge: Some(0)). + let appended = + RecordBatchIterator::new(vec![Ok(make_batch(&["b", "d", "d", "a"]))], schema.clone()); + let mut dataset = Dataset::write( + appended, + test_uri, + Some(WriteParams { + max_rows_per_file: 4, + mode: WriteMode::Append, + ..Default::default() + }), + ) + .await + .unwrap(); + + dataset + .optimize_indices(&OptimizeOptions::append()) + .await + .unwrap(); + let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + + // append() (= num_indices_to_merge: Some(0)) is now honored uniformly: + // Bitmap, like BTree, must keep the original segment untouched and add + // exactly one delta segment covering only the appended fragment. 
+ let committed = dataset.load_indices_by_name("cat_idx").await.unwrap(); + assert_eq!( + committed.len(), + 2, + "Bitmap optimize append() must add a delta segment, not merge, got {committed:?}" + ); + assert!( + committed.iter().any(|idx| idx.uuid == original_uuid), + "append() must preserve the pre-existing segment {original_uuid}, got {committed:?}" + ); + let new_segment = committed + .iter() + .find(|idx| idx.uuid != original_uuid) + .expect("append() must add a new delta segment"); + let new_segment_frags: std::collections::BTreeSet<u32> = new_segment + .fragment_bitmap + .as_ref() + .expect("delta Bitmap should carry fragment coverage") + .iter() + .collect(); + assert_eq!( + new_segment_frags, + [1u32].into_iter().collect(), + "the delta segment must cover only the appended fragment" + ); + + // Data correctness: a value that lives only in the appended fragment + // is queryable through the (now multi-segment) index. + let rows = dataset + .scan() + .filter("category = 'd'") + .unwrap() + .project(&["category"]) + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows(); + assert_eq!(rows, 2, "value 'd' lives in appended fragment"); + } + #[tokio::test] async fn test_optimize_btree_keeps_rows_with_stable_row_ids_after_compaction() { async fn query_id_count(dataset: &Dataset, id: &str) -> usize { diff --git a/rust/lance/src/index/create.rs b/rust/lance/src/index/create.rs index ff131c5ce23..ce8e65d8356 100644 --- a/rust/lance/src/index/create.rs +++ b/rust/lance/src/index/create.rs @@ -508,7 +508,7 @@ impl<'a> CreateIndexBuilder<'a> { } else { vec![] }; - let transaction = if uses_segment_commit_path(self.index_type, &new_idx.name, self.params) { + let transaction = if uses_segment_commit_path(self.index_type, self.params) { let field_id = *new_idx.fields.first().ok_or_else(|| { Error::internal(format!( "Index '{}' is missing field ids after build", @@ -562,6 +562,13 @@ impl<'a> CreateIndexBuilder<'a> { } } +fn is_btree_scalar_params(params: &dyn 
IndexParams) -> bool { + params + .as_any() + .downcast_ref::() + .is_some_and(|p| p.index_type.eq_ignore_ascii_case("btree")) +} + /// Validate that a user-supplied `index_uuid` is permitted for this build. fn ensure_index_uuid_allowed( index_type: IndexType, @@ -588,26 +595,35 @@ fn ensure_index_uuid_allowed( Ok(()) } -fn uses_segment_commit_path( - index_type: IndexType, - index_name: &str, - params: &dyn IndexParams, -) -> bool { - if index_name != LANCE_VECTOR_INDEX { - return false; +fn uses_segment_commit_path(index_type: IndexType, params: &dyn IndexParams) -> bool { + let params_family = params.index_name(); + + if params_family == LANCE_VECTOR_INDEX + && matches!( + index_type, + IndexType::Vector + | IndexType::IvfPq + | IndexType::IvfSq + | IndexType::IvfFlat + | IndexType::IvfRq + | IndexType::IvfHnswFlat + | IndexType::IvfHnswPq + | IndexType::IvfHnswSq + ) + && params.as_any().is::() + { + return true; + } + + if params_family == LANCE_SCALAR_INDEX { + match index_type { + IndexType::BTree => return true, + IndexType::Scalar if is_btree_scalar_params(params) => return true, + _ => {} + } } - matches!( - index_type, - IndexType::Vector - | IndexType::IvfPq - | IndexType::IvfSq - | IndexType::IvfFlat - | IndexType::IvfRq - | IndexType::IvfHnswFlat - | IndexType::IvfHnswPq - | IndexType::IvfHnswSq - ) && params.as_any().is::() + false } impl<'a> IntoFuture for CreateIndexBuilder<'a> { @@ -1882,6 +1898,143 @@ mod tests { assert_eq!(results.num_rows(), 20); } + #[tokio::test] + async fn test_btree_merge_existing_index_segments() { + use datafusion::common::ScalarValue; + use lance_index::scalar::{SargableQuery, SearchResult}; + use std::ops::Bound; + + // Open `segment` and count rows whose `id` falls in `[lo, hi)`. 
+ async fn count_in_range( + dataset: &Dataset, + segment: &IndexMetadata, + lo: i32, + hi: i32, + ) -> usize { + let field_path = dataset.schema().field_path(segment.fields[0]).unwrap(); + let index = crate::index::scalar::open_scalar_index( + dataset, + &field_path, + segment, + &NoOpMetricsCollector, + ) + .await + .unwrap(); + let query = SargableQuery::Range( + Bound::Included(ScalarValue::Int32(Some(lo))), + Bound::Excluded(ScalarValue::Int32(Some(hi))), + ); + match index.search(&query, &NoOpMetricsCollector).await.unwrap() { + SearchResult::Exact(row_addrs) => { + row_addrs.true_rows().row_addrs().unwrap().count() + } + other => panic!("expected exact result, got {other:?}"), + } + } + + let tmpdir = TempStrDir::default(); + let dataset_uri = format!("file://{}", tmpdir.as_str()); + + // 128 rows across two 64-row fragments. Stable row ids so the + // retired-fragment filter below exercises the exact row-id allow-list. + let reader = gen_batch() + .col("id", lance_datagen::array::step::()) + .into_reader_rows( + lance_datagen::RowCount::from(64), + lance_datagen::BatchCount::from(2), + ); + let mut dataset = Dataset::write( + reader, + &dataset_uri, + Some(WriteParams { + max_rows_per_file: 64, + mode: WriteMode::Overwrite, + enable_stable_row_ids: true, + ..Default::default() + }), + ) + .await + .unwrap(); + + // One staged BTree segment per fragment, committed as a multi-segment + // logical index. 
+ let params = ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::BTree); + let mut staged = Vec::new(); + for fragment in dataset.get_fragments() { + staged.push( + CreateIndexBuilder::new(&mut dataset, &["id"], IndexType::BTree, &params) + .name("id_btree".to_string()) + .fragments(vec![fragment.id() as u32]) + .execute_uncommitted() + .await + .unwrap(), + ); + } + dataset + .commit_existing_index_segments("id_btree", "id", staged) + .await + .unwrap(); + + // Phase 1 — healthy merge: the two per-fragment segments consolidate + // into a single canonical segment covering both fragments, and a range + // spanning both (ids 50..100) returns every matching row. + let merged = dataset + .merge_existing_index_segments(dataset.load_indices_by_name("id_btree").await.unwrap()) + .await + .unwrap(); + assert_eq!( + merged.fragment_bitmap.as_ref().unwrap(), + &roaring::RoaringBitmap::from_iter([0u32, 1]) + ); + assert!( + merged + .index_details + .as_ref() + .unwrap() + .type_url + .ends_with("BTreeIndexDetails") + ); + assert_eq!(count_in_range(&dataset, &merged, 50, 100).await, 50); + + // Phase 2 — retire fragment 0: delete >10% of its rows so compaction + // rewrites only frag 0 (frag 1 has no deletions and is at target size). + // The committed per-fragment segment now claims a fragment the dataset + // no longer has. 
+ dataset.delete("id < 16").await.unwrap(); + crate::dataset::optimize::compact_files( + &mut dataset, + crate::dataset::optimize::CompactionOptions { + target_rows_per_fragment: 64, + ..Default::default() + }, + None, + ) + .await + .unwrap(); + let live_frags: roaring::RoaringBitmap = dataset + .get_fragments() + .iter() + .map(|f| f.id() as u32) + .collect(); + assert!(!live_frags.contains(0), "compaction should retire frag 0"); + + // Filtered merge: coverage drops the retired fragment but keeps the + // live one, and the merged page data does not leak the retired row ids + // (ids < 16 lived only in frag 0, so the range now returns nothing). + let merged = dataset + .merge_existing_index_segments(dataset.load_indices_by_name("id_btree").await.unwrap()) + .await + .unwrap(); + let coverage = merged.fragment_bitmap.as_ref().unwrap(); + assert!(!coverage.contains(0), "must drop retired frag 0"); + assert!(coverage.contains(1), "must keep live frag 1"); + assert_eq!( + count_in_range(&dataset, &merged, 0, 16).await, + 0, + "must filter retired-fragment row ids" + ); + } + #[tokio::test] async fn test_commit_existing_index_supports_local_hnsw_segments() { let tmpdir = TempStrDir::default(); @@ -2143,39 +2296,38 @@ mod tests { // Load indices after optimization let indices_after = dataset.load_indices().await.unwrap(); - // There should be 3 indices: - // 1. one scalar index with name "id_idx", and the bitmap is [0,1] - // 2. one delta vector index with name "vector_idx", and the bitmap is [0] - // 3. one delta vector index with name "vector_idx", and the bitmap is [1] - assert_eq!(indices_after.len(), 3, "{:?}", indices_after); - let id_idx = indices_after + // After unifying scalar optimize, `OptimizeOptions::append()` honors + // `Some(0)` for BTree the same way it does for vector: keep the old + // segment, add a delta for the unindexed fragment. So we now expect: + // 1. id_idx old segment, bitmap [0] + // 2. id_idx delta segment, bitmap [1] + // 3. 
vector_idx old segment, bitmap [0] + // 4. vector_idx delta segment, bitmap [1] + // Previously BTree silently merged into 1 segment because legacy + // scalar ignored `num_indices_to_merge`. + assert_eq!(indices_after.len(), 4, "{:?}", indices_after); + let id_indices = indices_after .iter() - .find(|idx| idx.name == "id_idx") - .unwrap(); + .filter(|idx| idx.name == "id_idx") + .collect::<Vec<_>>(); let vector_indices = indices_after .iter() .filter(|idx| idx.name == "vector_idx") .collect::<Vec<_>>(); - assert!( - id_idx - .fragment_bitmap - .as_ref() - .unwrap() - .contains_range(0..2) - && id_idx.fragment_bitmap.as_ref().unwrap().len() == 2 - ); - assert_eq!(vector_indices.len(), 2); - assert!( - vector_indices - .iter() - .any(|idx| idx.fragment_bitmap.as_ref().unwrap().contains(0) - && idx.fragment_bitmap.as_ref().unwrap().len() == 1) - ); - assert!( - vector_indices - .iter() - .any(|idx| idx.fragment_bitmap.as_ref().unwrap().contains(1) - && idx.fragment_bitmap.as_ref().unwrap().len() == 1) - ); + for indices in [&id_indices, &vector_indices] { + assert_eq!(indices.len(), 2); + assert!( + indices + .iter() + .any(|idx| idx.fragment_bitmap.as_ref().unwrap().contains(0) + && idx.fragment_bitmap.as_ref().unwrap().len() == 1) + ); + assert!( + indices + .iter() + .any(|idx| idx.fragment_bitmap.as_ref().unwrap().contains(1) + && idx.fragment_bitmap.as_ref().unwrap().len() == 1) + ); + } } } diff --git a/rust/lance/src/index/scalar.rs b/rust/lance/src/index/scalar.rs index 05ddb273a93..92b06f0a1a5 100644 --- a/rust/lance/src/index/scalar.rs +++ b/rust/lance/src/index/scalar.rs @@ -5,6 +5,7 @@ //! 
pub(crate) mod bitmap; +pub(crate) mod btree; pub(crate) mod inverted; pub use inverted::{load_segment_details, load_segments}; diff --git a/rust/lance/src/index/scalar/btree.rs b/rust/lance/src/index/scalar/btree.rs new file mode 100644 index 00000000000..34534f6811b --- /dev/null +++ b/rust/lance/src/index/scalar/btree.rs @@ -0,0 +1,193 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +#![allow(clippy::redundant_pub_crate)] + +//! BTree-specific helpers for the segmented index workflow. +use std::sync::Arc; + +use arrow_schema::{Field as ArrowField, Schema as ArrowSchema}; +use datafusion::execution::SendableRecordBatchStream; +use datafusion::physical_plan::stream::RecordBatchStreamAdapter; +use lance_core::ROW_ID; +use lance_index::metrics::NoOpMetricsCollector; +use lance_index::pbold::BTreeIndexDetails; +use lance_index::scalar::btree::BTreeIndex; +use lance_index::scalar::lance_format::LanceIndexStore; +use lance_index::scalar::registry::VALUE_COLUMN_NAME; +use lance_index::scalar::{CreatedIndex, OldIndexDataFilter}; +use lance_table::format::IndexMetadata; +use roaring::RoaringBitmap; +use uuid::Uuid; + +use crate::{Dataset, Error, Result, dataset::index::LanceIndexStoreExt}; + +/// Build a row-empty `new_data` stream for the BTree merge API. 
+fn empty_btree_update_stream( + dataset: &Dataset, + field_id: i32, +) -> Result<SendableRecordBatchStream> { + let field = dataset.schema().field_by_id(field_id).ok_or_else(|| { + Error::invalid_input(format!( + "merge_existing_index_segments: field id {} does not exist", + field_id + )) + })?; + let schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new(VALUE_COLUMN_NAME, field.data_type(), true), + ArrowField::new(ROW_ID, arrow_schema::DataType::UInt64, false), + ])); + Ok(Box::pin(RecordBatchStreamAdapter::new( + schema, + futures::stream::empty(), + ))) +} + +fn ensure_btree_details(segment: &IndexMetadata) -> Result<()> { + if let Some(details) = segment.index_details.as_ref() + && !details.type_url.ends_with("BTreeIndexDetails") + { + return Err(Error::invalid_input(format!( + "Segment '{}' is not a BTree segment (details type_url = '{}')", + segment.uuid, details.type_url + ))); + } + Ok(()) +} + +/// Open the given BTree `segments` and k-way merge their already-sorted page +/// data, together with `new_data`, into a single canonical BTree written to +/// `new_store`. +pub(crate) async fn open_and_merge_segments( + dataset: &Dataset, + field_path: &str, + segments: &[&IndexMetadata], + new_data: SendableRecordBatchStream, + new_store: &LanceIndexStore, + old_data_filter: Option<OldIndexDataFilter>, +) -> Result<CreatedIndex> { + let mut source_indices = Vec::with_capacity(segments.len()); + for &segment in segments { + let scalar_index = + super::open_scalar_index(dataset, field_path, segment, &NoOpMetricsCollector).await?; + let btree = scalar_index + .as_any() + .downcast_ref::<BTreeIndex>() + .ok_or_else(|| { + Error::index(format!( + "BTree merge: expected BTree segment {}, got {:?}", + segment.uuid, + scalar_index.index_type() + )) + })?; + source_indices.push(Arc::new(btree.clone())); + } + BTreeIndex::merge_segments(&source_indices, new_data, new_store, old_data_filter).await +} + +/// Merge one caller-defined group of source BTree segments into a single +/// physical segment. 
+pub(crate) async fn merge_segments( + dataset: &Dataset, + segments: Vec<IndexMetadata>, +) -> Result<IndexMetadata> { + if segments.is_empty() { + return Err(Error::index("No segment metadata was provided".to_string())); + } + + for segment in &segments { + ensure_btree_details(segment)?; + } + + // All source segments must belong to the same column. + let reference_fields = segments[0].fields.as_slice(); + for segment in segments.iter().skip(1) { + if segment.fields.as_slice() != reference_fields { + return Err(Error::invalid_input(format!( + "BTree merge_segments: segment {} has fields {:?}, expected {:?}", + segment.uuid, segment.fields, reference_fields, + ))); + } + } + + let field_id = *segments[0].fields.first().ok_or_else(|| { + Error::invalid_input(format!( + "CreateIndex: segment {} is missing field ids", + segments[0].uuid + )) + })?; + let field_path = dataset.schema().field_path(field_id)?; + + // Intersect each segment's stored bitmap with the dataset's current + // fragments so we don't claim coverage on IDs that compaction or pruning + // has already retired. 
+ let dataset_fragments = dataset.fragment_bitmap.as_ref(); + let mut effective_old_frags = RoaringBitmap::new(); + let mut deleted_old_frags = RoaringBitmap::new(); + for segment in &segments { + if segment.fragment_bitmap.is_none() { + return Err(Error::invalid_input(format!( + "CreateIndex: segment {} is missing fragment coverage", + segment.uuid + ))); + } + if let Some(effective) = segment.effective_fragment_bitmap(dataset_fragments) { + effective_old_frags |= effective; + } + if let Some(deleted) = segment.deleted_fragment_bitmap(dataset_fragments) { + deleted_old_frags |= deleted; + } + } + + let fragment_bitmap = effective_old_frags.clone(); + let old_data_filter = crate::index::append::build_old_data_filter( + dataset, + &effective_old_frags, + &deleted_old_frags, + ) + .await?; + + let output_uuid = Uuid::new_v4(); + let new_store = LanceIndexStore::from_dataset_for_new(dataset, &output_uuid.to_string())?; + // Pure segment consolidation: no dataset scan, so `new_data` is an empty + // stream and the merge is driven entirely by the source page data. 
+ let empty_new_data = empty_btree_update_stream(dataset, field_id)?; + let segment_refs: Vec<&IndexMetadata> = segments.iter().collect(); + let created_index = open_and_merge_segments( + dataset, + &field_path, + &segment_refs, + empty_new_data, + &new_store, + old_data_filter, + ) + .await?; + + if !created_index + .index_details + .type_url + .ends_with("BTreeIndexDetails") + { + return Err(Error::internal(format!( + "merge_existing_index_segments: BTree merge produced unexpected details type_url '{}'", + created_index.index_details.type_url + ))); + } + debug_assert_eq!( + created_index.index_details, + prost_types::Any::from_msg(&BTreeIndexDetails::default()).unwrap(), + ); + + Ok(IndexMetadata { + uuid: output_uuid, + name: segments[0].name.clone(), + fields: vec![field_id], + dataset_version: dataset.manifest.version, + fragment_bitmap: Some(fragment_bitmap), + index_details: Some(Arc::new(created_index.index_details)), + index_version: created_index.index_version as i32, + created_at: Some(chrono::Utc::now()), + base_id: None, + files: created_index.files, + }) +} From 6eba1835feb47f4fe58524d5d196caf0ed8a9f6f Mon Sep 17 00:00:00 2001 From: ForwardXu Date: Fri, 5 Jun 2026 17:55:13 +0800 Subject: [PATCH 035/177] feat(lance-io): add GooseFS object store provider (#7034) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes: #7025 ## What Lance currently supports S3, GCS, Azure and local filesystem as storage backends. This PR adds native [GooseFS](https://cloud.tencent.com/document/product/436/56412) support, enabling Lance to read/write datasets stored on GooseFS distributed caching filesystem via gRPC through OpenDAL's `services-goosefs` (opendal v0.57). This adds a `GooseFsStoreProvider` that implements the `ObjectStoreProvider` trait, wires the new `goosefs` feature through `lance-io`, `lance`, and `lance-namespace-impls` crates, and includes E2E integration tests. 
URL format: `goosefs://host:port/path` ## Changes - Add `GooseFsStoreProvider` in `rust/lance-io/src/object_store/providers/goosefs.rs` that resolves GooseFS Master address with priority: `storage_options` > env var > URL authority. - Default Master gRPC port 9200 when not specified in URL. - HA support via comma-separated addresses (e.g. `addr1:9200,addr2:9200,addr3:9200`). - Configurable options via `storage_options` or environment variables: * `goosefs_master_addr` / `GOOSEFS_MASTER_ADDR` * `goosefs_write_type` / `GOOSEFS_WRITE_TYPE` * `goosefs_block_size` / `GOOSEFS_BLOCK_SIZE` * `goosefs_chunk_size` / `GOOSEFS_CHUNK_SIZE` * `goosefs_auth_type` / `GOOSEFS_AUTH_TYPE` (nosasl / simple) * `goosefs_auth_username` / `GOOSEFS_AUTH_USERNAME` - URL path mapped to OpenDAL root, `extract_path` returns empty to avoid path duplication. - Per-cluster cache isolation via `object_store_prefix` (formatted as `goosefs$host:port`). - Wire `goosefs` feature flag through `lance-io`, `lance`, and `lance-namespace-impls` Cargo.toml files. ## Notes - Architecture is 3-layer: * Layer 1: `GooseFsStoreProvider` (this PR) — Lance `ObjectStoreProvider` implementation * Layer 2: OpenDAL `services-goosefs` — implements the `Access` trait * Layer 3: `goosefs-client-rs` — gRPC client to GooseFS Master - Requires OpenDAL >= 0.57 which includes `services-goosefs` on crates.io. - All tests are gated behind `#[ignore]` and require a running GooseFS cluster. 
## Tests - E2E integration tests in `rust/lance-io/tests/goosefs_integration.rs` covering: * OpenDAL direct: write/read, list, stat operations * Lance ObjectStore I/O: put/get, list, large file (5MB) read/write * Advanced write modes: `PutMode::Create`, `rename_if_not_exists` - Run with: `cargo test -p lance-io --features "goosefs goosefs-test" --test goosefs_integration -- --ignored --nocapture --test-threads=1` --- Cargo.lock | 117 +++++++- java/lance-jni/Cargo.lock | 128 ++++++++- java/lance-jni/Cargo.toml | 2 +- python/Cargo.lock | 124 +++++++- python/Cargo.toml | 3 +- rust/examples/Cargo.toml | 2 +- rust/lance-io/Cargo.toml | 2 + rust/lance-io/src/object_store.rs | 2 +- rust/lance-io/src/object_store/providers.rs | 4 + .../src/object_store/providers/goosefs.rs | 266 +++++++++++++++++ rust/lance-io/tests/goosefs_integration.rs | 271 ++++++++++++++++++ rust/lance-namespace-impls/Cargo.toml | 5 + rust/lance/Cargo.toml | 6 +- 13 files changed, 912 insertions(+), 20 deletions(-) create mode 100644 rust/lance-io/src/object_store/providers/goosefs.rs create mode 100644 rust/lance-io/tests/goosefs_integration.rs diff --git a/Cargo.lock b/Cargo.lock index 028bfc79cb2..ff70a95ae1a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3526,6 +3526,30 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "goosefs-sdk" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ae079b88ffe7772d12cfc5c40a5a324babb357893d95b5e3a22ae857f236c5f" +dependencies = [ + "async-trait", + "bytes", + "dashmap", + "hostname", + "prost", + "prost-types", + "rand 0.9.4", + "reqwest 0.12.28", + "serde", + "thiserror 2.0.18", + "tokio", + "tokio-stream", + "tonic", + "tonic-prost", + "tracing", + "uuid", +] + [[package]] name = "h2" version = "0.4.14" @@ -3702,6 +3726,17 @@ dependencies = [ "digest 0.11.3", ] +[[package]] +name = "hostname" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"617aaa3557aef3810a6369d0a99fac8a080891b68bd9f9812a1eeda0c0730cbd" +dependencies = [ + "cfg-if 1.0.4", + "libc", + "windows-link", +] + [[package]] name = "http" version = "0.2.12" @@ -3823,6 +3858,19 @@ dependencies = [ "webpki-roots 1.0.7", ] +[[package]] +name = "hyper-timeout" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0" +dependencies = [ + "hyper", + "hyper-util", + "pin-project-lite", + "tokio", + "tower-service", +] + [[package]] name = "hyper-tls" version = "0.6.0" @@ -4503,6 +4551,7 @@ dependencies = [ "parquet", "permutation", "pin-project", + "pprof", "pretty_assertions", "prost", "prost-build", @@ -4985,6 +5034,7 @@ name = "lance-namespace-impls" version = "8.0.0-beta.3" dependencies = [ "arrow", + "arrow-array", "arrow-ipc", "arrow-schema", "async-trait", @@ -4997,6 +5047,7 @@ dependencies = [ "futures", "hmac 0.12.1", "lance", + "lance-arrow", "lance-core", "lance-index", "lance-io", @@ -5005,6 +5056,7 @@ dependencies = [ "lance-table", "log", "object_store", + "opendal", "quick-xml 0.38.4", "rand 0.9.4", "reqwest 0.12.28", @@ -6003,6 +6055,7 @@ dependencies = [ "opendal-service-azdls", "opendal-service-cos", "opendal-service-gcs", + "opendal-service-goosefs", "opendal-service-hf", "opendal-service-oss", "opendal-service-s3", @@ -6169,6 +6222,20 @@ dependencies = [ "tokio", ] +[[package]] +name = "opendal-service-goosefs" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69e43048bde419947ba826fbdc2f134d6c03f44ebf48bd33a03b72f9fc45fcb4" +dependencies = [ + "bytes", + "goosefs-sdk", + "log", + "opendal-core", + "serde", + "tokio", +] + [[package]] name = "opendal-service-hf" version = "0.57.0" @@ -8083,9 +8150,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.20.0" +version = "3.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e72c1c2cb7b223fafb600a619537a871c2818583d619401b785e7c0b746ccde2" +checksum = "76a5c54c7310e7b8b9577c286d7e399ddd876c3e12b3ed917a8aabc4b96e9e8c" dependencies = [ "base64 0.22.1", "bs58", @@ -8103,9 +8170,9 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.20.0" +version = "3.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b90c488738ecb4fb0262f41f43bc40efc5868d9fb744319ddf5f5317f417bfac" +checksum = "84d57bc0c8b9a17920c178daa6bb924850d54a9c97ab45194bb8c17ad66bb660" dependencies = [ "darling 0.23.0", "proc-macro2", @@ -9009,6 +9076,45 @@ dependencies = [ "winnow", ] +[[package]] +name = "tonic" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac2a5518c70fa84342385732db33fb3f44bc4cc748936eb5833d2df34d6445ef" +dependencies = [ + "async-trait", + "base64 0.22.1", + "bytes", + "h2", + "http 1.4.1", + "http-body 1.0.1", + "http-body-util", + "hyper", + "hyper-timeout", + "hyper-util", + "percent-encoding", + "pin-project", + "socket2", + "sync_wrapper", + "tokio", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic-prost" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50849f68853be452acf590cde0b146665b8d507b3b8af17261df47e02c209ea0" +dependencies = [ + "bytes", + "prost", + "tonic", +] + [[package]] name = "tower" version = "0.5.3" @@ -9017,9 +9123,12 @@ checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" dependencies = [ "futures-core", "futures-util", + "indexmap 2.14.0", "pin-project-lite", + "slab", "sync_wrapper", "tokio", + "tokio-util", "tower-layer", "tower-service", "tracing", diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index bc93593ed24..8dde29c552a 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -2915,6 +2915,30 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = 
"goosefs-sdk" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ae079b88ffe7772d12cfc5c40a5a324babb357893d95b5e3a22ae857f236c5f" +dependencies = [ + "async-trait", + "bytes", + "dashmap", + "hostname", + "prost", + "prost-types", + "rand 0.9.4", + "reqwest 0.12.28", + "serde", + "thiserror 2.0.18", + "tokio", + "tokio-stream", + "tonic", + "tonic-prost", + "tracing", + "uuid", +] + [[package]] name = "h2" version = "0.4.14" @@ -3067,6 +3091,17 @@ dependencies = [ "digest 0.11.3", ] +[[package]] +name = "hostname" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "617aaa3557aef3810a6369d0a99fac8a080891b68bd9f9812a1eeda0c0730cbd" +dependencies = [ + "cfg-if 1.0.4", + "libc", + "windows-link", +] + [[package]] name = "http" version = "0.2.12" @@ -3185,6 +3220,20 @@ dependencies = [ "tokio", "tokio-rustls", "tower-service", + "webpki-roots", +] + +[[package]] +name = "hyper-timeout" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0" +dependencies = [ + "hyper", + "hyper-util", + "pin-project-lite", + "tokio", + "tower-service", ] [[package]] @@ -4217,9 +4266,9 @@ dependencies = [ [[package]] name = "lance-namespace-reqwest-client" -version = "0.8.0" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3eefb02ded2c3d4b6b60669bb74822d9fa628e144fc748c79ee31f13f566e87b" +checksum = "7a09733325812e046cb217d548afc4864dedb59545389d45cd498b3d8ecb0d20" dependencies = [ "reqwest 0.12.28", "serde", @@ -4877,6 +4926,7 @@ dependencies = [ "opendal-service-azdls", "opendal-service-cos", "opendal-service-gcs", + "opendal-service-goosefs", "opendal-service-hf", "opendal-service-oss", "opendal-service-s3", @@ -5043,6 +5093,20 @@ dependencies = [ "tokio", ] +[[package]] +name = "opendal-service-goosefs" +version = "0.57.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "69e43048bde419947ba826fbdc2f134d6c03f44ebf48bd33a03b72f9fc45fcb4" +dependencies = [ + "bytes", + "goosefs-sdk", + "log", + "opendal-core", + "serde", + "tokio", +] + [[package]] name = "opendal-service-hf" version = "0.57.0" @@ -6078,6 +6142,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams 0.4.2", "web-sys", + "webpki-roots", ] [[package]] @@ -6577,9 +6642,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.20.0" +version = "3.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e72c1c2cb7b223fafb600a619537a871c2818583d619401b785e7c0b746ccde2" +checksum = "76a5c54c7310e7b8b9577c286d7e399ddd876c3e12b3ed917a8aabc4b96e9e8c" dependencies = [ "base64", "bs58", @@ -6597,9 +6662,9 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.20.0" +version = "3.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b90c488738ecb4fb0262f41f43bc40efc5868d9fb744319ddf5f5317f417bfac" +checksum = "84d57bc0c8b9a17920c178daa6bb924850d54a9c97ab45194bb8c17ad66bb660" dependencies = [ "darling", "proc-macro2", @@ -7275,6 +7340,45 @@ dependencies = [ "winnow", ] +[[package]] +name = "tonic" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac2a5518c70fa84342385732db33fb3f44bc4cc748936eb5833d2df34d6445ef" +dependencies = [ + "async-trait", + "base64", + "bytes", + "h2", + "http 1.4.1", + "http-body 1.0.1", + "http-body-util", + "hyper", + "hyper-timeout", + "hyper-util", + "percent-encoding", + "pin-project", + "socket2", + "sync_wrapper", + "tokio", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic-prost" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50849f68853be452acf590cde0b146665b8d507b3b8af17261df47e02c209ea0" +dependencies = [ + "bytes", + "prost", + 
"tonic", +] + [[package]] name = "tower" version = "0.5.3" @@ -7283,9 +7387,12 @@ checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" dependencies = [ "futures-core", "futures-util", + "indexmap 2.14.0", "pin-project-lite", + "slab", "sync_wrapper", "tokio", + "tokio-util", "tower-layer", "tower-service", "tracing", @@ -7827,6 +7934,15 @@ dependencies = [ "rustls-pki-types", ] +[[package]] +name = "webpki-roots" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52f5ee44c96cf55f1b349600768e3ece3a8f26010c05265ab73f945bb1a2eb9d" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "whoami" version = "2.1.2" diff --git a/java/lance-jni/Cargo.toml b/java/lance-jni/Cargo.toml index 090aedcae2f..e8bf842dbdc 100644 --- a/java/lance-jni/Cargo.toml +++ b/java/lance-jni/Cargo.toml @@ -23,7 +23,7 @@ lance-linalg = { path = "../../rust/lance-linalg" } lance-index = { path = "../../rust/lance-index" } lance-io = { path = "../../rust/lance-io" } lance-namespace = { path = "../../rust/lance-namespace" } -lance-namespace-impls = { path = "../../rust/lance-namespace-impls", features = ["rest", "rest-adapter"] } +lance-namespace-impls = { path = "../../rust/lance-namespace-impls", features = ["rest", "rest-adapter", "dir-goosefs"] } lance-core = { path = "../../rust/lance-core" } lance-file = { path = "../../rust/lance-file" } lance-table = { path = "../../rust/lance-table" } diff --git a/python/Cargo.lock b/python/Cargo.lock index 9a900dbd26b..908c2a21423 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -3274,6 +3274,30 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "goosefs-sdk" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ae079b88ffe7772d12cfc5c40a5a324babb357893d95b5e3a22ae857f236c5f" +dependencies = [ + "async-trait", + "bytes", + "dashmap", + "hostname", + "prost", + "prost-types", + "rand 0.9.4", + "reqwest 0.12.28", + 
"serde", + "thiserror 2.0.18", + "tokio", + "tokio-stream", + "tonic", + "tonic-prost", + "tracing", + "uuid", +] + [[package]] name = "h2" version = "0.4.14" @@ -3426,6 +3450,17 @@ dependencies = [ "digest 0.11.3", ] +[[package]] +name = "hostname" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "617aaa3557aef3810a6369d0a99fac8a080891b68bd9f9812a1eeda0c0730cbd" +dependencies = [ + "cfg-if 1.0.4", + "libc", + "windows-link", +] + [[package]] name = "http" version = "0.2.12" @@ -3544,6 +3579,20 @@ dependencies = [ "tokio", "tokio-rustls", "tower-service", + "webpki-roots", +] + +[[package]] +name = "hyper-timeout" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0" +dependencies = [ + "hyper", + "hyper-util", + "pin-project-lite", + "tokio", + "tower-service", ] [[package]] @@ -5341,6 +5390,7 @@ dependencies = [ "opendal-service-azdls", "opendal-service-cos", "opendal-service-gcs", + "opendal-service-goosefs", "opendal-service-hf", "opendal-service-oss", "opendal-service-s3", @@ -5507,6 +5557,20 @@ dependencies = [ "tokio", ] +[[package]] +name = "opendal-service-goosefs" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69e43048bde419947ba826fbdc2f134d6c03f44ebf48bd33a03b72f9fc45fcb4" +dependencies = [ + "bytes", + "goosefs-sdk", + "log", + "opendal-core", + "serde", + "tokio", +] + [[package]] name = "opendal-service-hf" version = "0.57.0" @@ -6776,6 +6840,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams 0.4.2", "web-sys", + "webpki-roots", ] [[package]] @@ -7305,9 +7370,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.20.0" +version = "3.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e72c1c2cb7b223fafb600a619537a871c2818583d619401b785e7c0b746ccde2" +checksum = 
"76a5c54c7310e7b8b9577c286d7e399ddd876c3e12b3ed917a8aabc4b96e9e8c" dependencies = [ "base64", "bs58", @@ -7325,9 +7390,9 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.20.0" +version = "3.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b90c488738ecb4fb0262f41f43bc40efc5868d9fb744319ddf5f5317f417bfac" +checksum = "84d57bc0c8b9a17920c178daa6bb924850d54a9c97ab45194bb8c17ad66bb660" dependencies = [ "darling 0.23.0", "proc-macro2", @@ -8057,6 +8122,45 @@ dependencies = [ "winnow", ] +[[package]] +name = "tonic" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac2a5518c70fa84342385732db33fb3f44bc4cc748936eb5833d2df34d6445ef" +dependencies = [ + "async-trait", + "base64", + "bytes", + "h2", + "http 1.4.1", + "http-body 1.0.1", + "http-body-util", + "hyper", + "hyper-timeout", + "hyper-util", + "percent-encoding", + "pin-project", + "socket2", + "sync_wrapper", + "tokio", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic-prost" +version = "0.14.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50849f68853be452acf590cde0b146665b8d507b3b8af17261df47e02c209ea0" +dependencies = [ + "bytes", + "prost", + "tonic", +] + [[package]] name = "tower" version = "0.5.3" @@ -8065,9 +8169,12 @@ checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" dependencies = [ "futures-core", "futures-util", + "indexmap 2.14.0", "pin-project-lite", + "slab", "sync_wrapper", "tokio", + "tokio-util", "tower-layer", "tower-service", "tracing", @@ -8647,6 +8754,15 @@ dependencies = [ "rustls-pki-types", ] +[[package]] +name = "webpki-roots" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52f5ee44c96cf55f1b349600768e3ece3a8f26010c05265ab73f945bb1a2eb9d" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "whoami" 
version = "2.1.2" diff --git a/python/Cargo.toml b/python/Cargo.toml index dc78368145a..e96e8862329 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -31,6 +31,7 @@ half = { version = "2.5", default-features = false, features = [ "std", ] } lance = { path = "../rust/lance", features = [ + "goosefs", "dynamodb", "substrait", ] } @@ -46,7 +47,7 @@ lance-index = { path = "../rust/lance-index", features = [ lance-io = { path = "../rust/lance-io" } lance-linalg = { path = "../rust/lance-linalg" } lance-namespace = { path = "../rust/lance-namespace" } -lance-namespace-impls = { path = "../rust/lance-namespace-impls", features = ["rest", "rest-adapter"] } +lance-namespace-impls = { path = "../rust/lance-namespace-impls", features = ["rest", "rest-adapter", "dir-goosefs"] } lance-table = { path = "../rust/lance-table" } lance-datafusion = { path = "../rust/lance-datafusion" } libc = "0.2.176" diff --git a/rust/examples/Cargo.toml b/rust/examples/Cargo.toml index 3a1ce0ea03c..a4e760f8cbe 100644 --- a/rust/examples/Cargo.toml +++ b/rust/examples/Cargo.toml @@ -38,7 +38,7 @@ arrow-select = { workspace = true } clap = { workspace = true, features = ["derive"] } itertools = { workspace = true } futures = { workspace = true } -lance = { workspace = true, features = ["aws", "azure", "gcp", "oss", "huggingface", "tencent"] } +lance = { workspace = true, features = ["aws", "azure", "gcp", "oss", "huggingface", "tencent", "goosefs"] } lance-index = { workspace = true } lance-core = { workspace = true } lance-linalg = { workspace = true } diff --git a/rust/lance-io/Cargo.toml b/rust/lance-io/Cargo.toml index d1aabff3f7e..d94b811cec0 100644 --- a/rust/lance-io/Cargo.toml +++ b/rust/lance-io/Cargo.toml @@ -69,10 +69,12 @@ harness = false [features] default = ["aws", "azure", "gcp"] gcs-test = [] +goosefs-test = [] gcp = ["object_store/gcp", "dep:opendal", "opendal/services-gcs", "dep:object_store_opendal"] aws = ["object_store/aws", "dep:aws-config", "dep:aws-credential-types", 
"dep:opendal", "opendal/services-s3", "dep:object_store_opendal"] azure = ["object_store/azure", "dep:opendal", "opendal/services-azblob", "opendal/services-azdls", "dep:object_store_opendal"] oss = ["dep:opendal", "opendal/services-oss", "dep:object_store_opendal"] +goosefs = ["dep:opendal", "opendal/services-goosefs", "dep:object_store_opendal"] tencent = ["dep:opendal", "opendal/services-cos", "dep:object_store_opendal"] huggingface = ["dep:opendal", "opendal/services-huggingface", "dep:object_store_opendal"] tos = ["dep:opendal", "opendal/services-tos", "dep:object_store_opendal"] diff --git a/rust/lance-io/src/object_store.rs b/rust/lance-io/src/object_store.rs index b1ba3d45d8a..1a4fd18c01e 100644 --- a/rust/lance-io/src/object_store.rs +++ b/rust/lance-io/src/object_store.rs @@ -838,7 +838,7 @@ impl ObjectStore { .common_prefixes .iter() .chain(output.objects.iter().map(|o| &o.location)) - .map(|s| s.filename().unwrap().to_string()) + .filter_map(|s| s.filename().map(|f| f.to_string())) .collect()) } diff --git a/rust/lance-io/src/object_store/providers.rs b/rust/lance-io/src/object_store/providers.rs index aafe665cfe1..45ac30a757a 100644 --- a/rust/lance-io/src/object_store/providers.rs +++ b/rust/lance-io/src/object_store/providers.rs @@ -24,6 +24,8 @@ pub mod aws; pub mod azure; #[cfg(feature = "gcp")] pub mod gcp; +#[cfg(feature = "goosefs")] +pub mod goosefs; #[cfg(feature = "huggingface")] pub mod huggingface; pub mod local; @@ -327,6 +329,8 @@ impl Default for ObjectStoreRegistry { } #[cfg(feature = "gcp")] providers.insert("gs".into(), Arc::new(gcp::GcsStoreProvider)); + #[cfg(feature = "goosefs")] + providers.insert("goosefs".into(), Arc::new(goosefs::GooseFsStoreProvider)); #[cfg(feature = "oss")] providers.insert("oss".into(), Arc::new(oss::OssStoreProvider)); #[cfg(feature = "tencent")] diff --git a/rust/lance-io/src/object_store/providers/goosefs.rs b/rust/lance-io/src/object_store/providers/goosefs.rs new file mode 100644 index 
00000000000..d6173571551 --- /dev/null +++ b/rust/lance-io/src/object_store/providers/goosefs.rs @@ -0,0 +1,266 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::collections::HashMap; +use std::sync::Arc; + +use object_store::path::Path; +use object_store_opendal::OpendalStore; +use opendal::{Operator, services::GooseFs}; +use url::Url; + +use crate::object_store::{ + DEFAULT_CLOUD_BLOCK_SIZE, DEFAULT_CLOUD_IO_PARALLELISM, DEFAULT_MAX_IOP_SIZE, ObjectStore, + ObjectStoreParams, ObjectStoreProvider, StorageOptions, +}; +use lance_core::error::{Error, Result}; + +/// Default GooseFS Master gRPC port. +const DEFAULT_GOOSEFS_PORT: u16 = 9200; + +/// GooseFS object store provider. +/// +/// Uses OpenDAL's GooseFs service to access GooseFS via gRPC. +/// URL format: `goosefs://host:port/path` +/// +/// Where: +/// - `host:port` is the GooseFS Master address (default port: 9200) +/// - `/path` is the filesystem path within GooseFS +/// +/// Configuration priority: storage_options > environment variables > URL authority > defaults +#[derive(Default, Debug)] +pub struct GooseFsStoreProvider; + +impl GooseFsStoreProvider { + /// Resolve the GooseFS Master address from storage_options, environment, or URL. + /// + /// Priority: + /// 1. `storage_options["goosefs_master_addr"]` (supports HA: "addr1:port,addr2:port") + /// 2. `GOOSEFS_MASTER_ADDR` environment variable + /// 3. URL authority (host:port from the URL) + fn resolve_master_addr(url: &Url, storage_options: &StorageOptions) -> Result { + // 1. storage_options + if let Some(addr) = storage_options + .0 + .get("goosefs_master_addr") + .filter(|v| !v.is_empty()) + { + return Ok(addr.clone()); + } + + // 2. Environment variable + if let Ok(addr) = std::env::var("GOOSEFS_MASTER_ADDR") + && !addr.is_empty() + { + return Ok(addr); + } + + // 3. 
URL authority + let host = url.host_str().ok_or_else(|| { + Error::invalid_input( + "GooseFS URL must contain a master address (host), e.g. goosefs://host:port/path", + ) + })?; + + let port = url.port().unwrap_or(DEFAULT_GOOSEFS_PORT); + Ok(format!("{}:{}", host, port)) + } + + /// Resolve a storage option from storage_options or environment variable. + fn resolve_option( + storage_options: &StorageOptions, + option_key: &str, + env_key: &str, + ) -> Option { + storage_options + .0 + .get(option_key) + .cloned() + .or_else(|| std::env::var(env_key).ok()) + .filter(|v| !v.is_empty()) + } +} + +#[async_trait::async_trait] +impl ObjectStoreProvider for GooseFsStoreProvider { + async fn new_store(&self, base_path: Url, params: &ObjectStoreParams) -> Result { + let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); + let storage_options = StorageOptions(params.storage_options().cloned().unwrap_or_default()); + + // Resolve master address + let master_addr = Self::resolve_master_addr(&base_path, &storage_options)?; + + // Extract root path from URL + let root = base_path.path().to_string(); + + // Build OpenDAL config map + let mut config_map: HashMap = HashMap::new(); + config_map.insert("master_addr".to_string(), master_addr); + + if !root.is_empty() && root != "/" { + config_map.insert("root".to_string(), root); + } + + // Optional: write_type + if let Some(wt) = + Self::resolve_option(&storage_options, "goosefs_write_type", "GOOSEFS_WRITE_TYPE") + { + config_map.insert("write_type".to_string(), wt); + } + + // Optional: block_size (for GooseFS, not Lance block_size) + if let Some(bs) = + Self::resolve_option(&storage_options, "goosefs_block_size", "GOOSEFS_BLOCK_SIZE") + { + config_map.insert("block_size".to_string(), bs); + } + + // Optional: chunk_size + if let Some(cs) = + Self::resolve_option(&storage_options, "goosefs_chunk_size", "GOOSEFS_CHUNK_SIZE") + { + config_map.insert("chunk_size".to_string(), cs); + } + + // Optional: auth_type (nosasl 
/ simple) + if let Some(at) = + Self::resolve_option(&storage_options, "goosefs_auth_type", "GOOSEFS_AUTH_TYPE") + { + config_map.insert("auth_type".to_string(), at); + } + + // Optional: auth_username (used in SIMPLE auth mode) + if let Some(au) = Self::resolve_option( + &storage_options, + "goosefs_auth_username", + "GOOSEFS_AUTH_USERNAME", + ) { + config_map.insert("auth_username".to_string(), au); + } + + // Create OpenDAL Operator with GooseFS service + let operator = Operator::from_iter::(config_map) + .map_err(|e| { + Error::invalid_input(format!("Failed to create GooseFS operator: {:?}", e)) + })? + .finish(); + + // Wrap as object_store::ObjectStore via OpendalStore bridge + let opendal_store = Arc::new(OpendalStore::new(operator)); + + Ok(ObjectStore { + scheme: "goosefs".to_string(), + inner: opendal_store, + block_size, + max_iop_size: *DEFAULT_MAX_IOP_SIZE, + use_constant_size_upload_parts: params.use_constant_size_upload_parts, + list_is_lexically_ordered: params.list_is_lexically_ordered.unwrap_or(false), + io_parallelism: DEFAULT_CLOUD_IO_PARALLELISM, + download_retry_count: storage_options.download_retry_count(), + io_tracker: Default::default(), + store_prefix: self + .calculate_object_store_prefix(&base_path, params.storage_options())?, + }) + } + + /// Extract the path relative to the root of the GooseFS filesystem. + /// + /// For GooseFS, the entire URL path is set as the OpenDAL `root` in `new_store`, + /// so the relative path returned here must be empty to avoid path duplication. + /// + /// `goosefs://host:port/data/file.lance` → root="/data/file.lance", extract_path="" + fn extract_path(&self, _url: &Url) -> Result { + Ok(Path::from("")) + } + + /// Calculate the object store prefix for caching. + /// + /// Format: `goosefs$host:port` + /// This ensures different GooseFS clusters get separate caches. 
+ fn calculate_object_store_prefix( + &self, + url: &Url, + _storage_options: Option<&HashMap>, + ) -> Result { + Ok(format!("{}${}", url.scheme(), url.authority())) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_goosefs_store_path() { + let provider = GooseFsStoreProvider; + + let url = Url::parse("goosefs://10.0.0.1:9200/data/embeddings.lance").unwrap(); + let path = provider.extract_path(&url).unwrap(); + // extract_path returns empty because the full path is used as OpenDAL root + assert_eq!(path.to_string(), ""); + } + + #[test] + fn test_goosefs_store_root_path() { + let provider = GooseFsStoreProvider; + + let url = Url::parse("goosefs://10.0.0.1:9200/").unwrap(); + let path = provider.extract_path(&url).unwrap(); + assert_eq!(path.to_string(), ""); + } + + #[test] + fn test_goosefs_store_deep_path() { + let provider = GooseFsStoreProvider; + + let url = Url::parse("goosefs://master:9200/a/b/c/d.lance").unwrap(); + let path = provider.extract_path(&url).unwrap(); + // All path components are in the OpenDAL root, extract_path is empty + assert_eq!(path.to_string(), ""); + } + + #[test] + fn test_calculate_object_store_prefix() { + let provider = GooseFsStoreProvider; + + let url = Url::parse("goosefs://10.0.0.1:9200/data").unwrap(); + let prefix = provider.calculate_object_store_prefix(&url, None).unwrap(); + assert_eq!(prefix, "goosefs$10.0.0.1:9200"); + } + + #[test] + fn test_calculate_object_store_prefix_with_hostname() { + let provider = GooseFsStoreProvider; + + let url = Url::parse("goosefs://myhost:9200/data").unwrap(); + let prefix = provider.calculate_object_store_prefix(&url, None).unwrap(); + assert_eq!(prefix, "goosefs$myhost:9200"); + } + + #[test] + fn test_resolve_master_addr_from_url() { + let url = Url::parse("goosefs://10.0.0.1:9200/data").unwrap(); + let storage_options = StorageOptions(HashMap::new()); + let addr = GooseFsStoreProvider::resolve_master_addr(&url, &storage_options).unwrap(); + assert_eq!(addr, 
"10.0.0.1:9200"); + } + + #[test] + fn test_resolve_master_addr_default_port() { + let url = Url::parse("goosefs://10.0.0.1/data").unwrap(); + let storage_options = StorageOptions(HashMap::new()); + let addr = GooseFsStoreProvider::resolve_master_addr(&url, &storage_options).unwrap(); + assert_eq!(addr, "10.0.0.1:9200"); + } + + #[test] + fn test_resolve_master_addr_from_storage_options() { + let url = Url::parse("goosefs://10.0.0.1:9200/data").unwrap(); + let storage_options = StorageOptions(HashMap::from([( + "goosefs_master_addr".to_string(), + "10.0.0.2:9200,10.0.0.3:9200".to_string(), + )])); + let addr = GooseFsStoreProvider::resolve_master_addr(&url, &storage_options).unwrap(); + assert_eq!(addr, "10.0.0.2:9200,10.0.0.3:9200"); + } +} diff --git a/rust/lance-io/tests/goosefs_integration.rs b/rust/lance-io/tests/goosefs_integration.rs new file mode 100644 index 00000000000..fd9015bcda0 --- /dev/null +++ b/rust/lance-io/tests/goosefs_integration.rs @@ -0,0 +1,271 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! GooseFS integration tests via OpenDAL. +//! +//! Covers Stage 2 (OpenDAL direct), Stage 3 (Lance ObjectStore I/O), +//! and diagnostic tests (OpenDAL via lance-io ObjectStore). +//! +//! Run: +//! 
cargo test -p lance-io --features "goosefs goosefs-test" --test goosefs_integration -- --ignored --nocapture --test-threads=1 +#![cfg(feature = "goosefs-test")] +#![allow(clippy::print_stderr)] + +use std::sync::Arc; + +use futures::TryStreamExt; +use object_store::ObjectStoreExt; +use opendal::{Operator, services::GooseFs}; +use std::collections::HashMap; + +fn get_operator() -> Operator { + let addr = std::env::var("GOOSEFS_MASTER_ADDR").unwrap_or("127.0.0.1:9200".into()); + let auth_type = std::env::var("GOOSEFS_AUTH_TYPE").unwrap_or("simple".into()); + let mut cfg = HashMap::new(); + cfg.insert("master_addr".to_string(), addr); + cfg.insert("root".to_string(), "/lance-test/opendal".to_string()); + cfg.insert("auth_type".to_string(), auth_type); + Operator::from_iter::(cfg).unwrap().finish() +} + +// ============================================================ +// Stage 2: OpenDAL GooseFs Service tests +// ============================================================ + +#[ignore = "Requires GooseFS cluster"] +#[tokio::test] +async fn test_opendal_write_read() { + let op = get_operator(); + // Cleanup any leftover from previous runs + let _ = op.delete("hello.txt").await; + op.write("hello.txt", "Hello from OpenDAL").await.unwrap(); + let data = op.read("hello.txt").await.unwrap(); + assert_eq!(data.to_vec(), b"Hello from OpenDAL"); + op.delete("hello.txt").await.unwrap(); +} + +#[ignore = "Requires GooseFS cluster"] +#[tokio::test] +async fn test_opendal_list() { + let op = get_operator(); + // Write files directly (GooseFS may have h2 issues with newly-created subdirs) + let _ = op.delete("list_a.txt").await; + let _ = op.delete("list_b.txt").await; + op.write("list_a.txt", "aaa").await.unwrap(); + op.write("list_b.txt", "bbb").await.unwrap(); + let entries: Vec<_> = op.list("/").await.unwrap(); + let names: Vec = entries.iter().map(|e| e.name().to_string()).collect(); + eprintln!("Listed entries: {:?}", names); + assert!( + entries.len() >= 2, + "Expected at 
least 2 entries, got {}", + entries.len() + ); + op.delete("list_a.txt").await.unwrap(); + op.delete("list_b.txt").await.unwrap(); +} + +#[ignore = "Requires GooseFS cluster"] +#[tokio::test] +async fn test_opendal_stat() { + let op = get_operator(); + // Cleanup leftover from previous runs + let _ = op.delete("stat_test.txt").await; + op.write("stat_test.txt", "12345").await.unwrap(); + let meta = op.stat("stat_test.txt").await.unwrap(); + assert_eq!(meta.content_length(), 5); + op.delete("stat_test.txt").await.unwrap(); +} + +// ============================================================ +// Stage 3: Lance ObjectStore I/O tests +// ============================================================ + +use lance_io::object_store::ObjectStore; + +async fn get_lance_store() -> Arc { + let addr = std::env::var("GOOSEFS_MASTER_ADDR").unwrap_or("127.0.0.1:9200".into()); + let uri = format!("goosefs://{}/lance-test/lance-io", addr); + ObjectStore::from_uri(&uri).await.unwrap().0 +} + +#[ignore = "Requires GooseFS cluster"] +#[tokio::test] +async fn test_lance_objectstore_put_get() { + let store = get_lance_store().await; + let path = object_store::path::Path::from("test_put_get.bin"); + + // Cleanup + let _ = store.inner.delete(&path).await; + + // Write + store + .inner + .put(&path, (&b"lance-goosefs-test"[..]).into()) + .await + .unwrap(); + + // Read + let result = store.inner.get(&path).await.unwrap(); + let bytes = result.bytes().await.unwrap(); + assert_eq!(&bytes[..], b"lance-goosefs-test"); + + // Cleanup + store.inner.delete(&path).await.unwrap(); +} + +#[ignore = "Requires GooseFS cluster"] +#[tokio::test] +async fn test_lance_objectstore_list() { + let store = get_lance_store().await; + + let file_a = object_store::path::Path::from("list_a.bin"); + let file_b = object_store::path::Path::from("list_b.bin"); + + // Cleanup leftovers + let _ = store.inner.delete(&file_a).await; + let _ = store.inner.delete(&file_b).await; + + store + .inner + .put(&file_a, 
(&b"aaa"[..]).into()) + .await + .unwrap(); + store + .inner + .put(&file_b, (&b"bbb"[..]).into()) + .await + .unwrap(); + + let entries: Vec<_> = store.inner.list(None).try_collect().await.unwrap(); + eprintln!("Listed {} entries", entries.len()); + assert!( + entries.len() >= 2, + "Expected at least 2 entries, got {}", + entries.len() + ); + + store.inner.delete(&file_a).await.unwrap(); + store.inner.delete(&file_b).await.unwrap(); +} + +#[ignore = "Requires GooseFS cluster"] +#[tokio::test] +async fn test_lance_objectstore_large_file() { + let store = get_lance_store().await; + let path = object_store::path::Path::from("large_file.bin"); + let _ = store.inner.delete(&path).await; + + // Write 5MB file + let data = vec![42u8; 5 * 1024 * 1024]; + store.inner.put(&path, data.clone().into()).await.unwrap(); + + let result = store.inner.get(&path).await.unwrap(); + let bytes = result.bytes().await.unwrap(); + assert_eq!(bytes.len(), 5 * 1024 * 1024); + assert_eq!(&bytes[..10], &[42u8; 10]); + + store.inner.delete(&path).await.unwrap(); +} + +// ============================================================ +// Diagnostic: lance-io ObjectStore advanced write modes +// ============================================================ + +use lance_io::object_store::{ObjectStoreParams, ObjectStoreRegistry}; + +#[tokio::test] +#[ignore = "Requires GooseFS cluster"] +async fn test_diag_lance_io_write_modes() { + let addr = std::env::var("GOOSEFS_MASTER_ADDR").unwrap_or_else(|_| "127.0.0.1:9200".into()); + let ts = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_millis(); + let root = format!("goosefs://{}/lance-test/lance_io_direct_{}", addr, ts); + + eprintln!("[DIAG] Creating ObjectStore at: {}", root); + + let params = ObjectStoreParams::default(); + let registry = Arc::new(ObjectStoreRegistry::default()); + let (object_store, _path) = ObjectStore::from_uri_and_params(registry, &root, ¶ms) + .await + .expect("Failed to create 
ObjectStore"); + + // Test 1: Basic put + get + let test_path = object_store::path::Path::parse("test_file.txt").unwrap(); + let test_data = bytes::Bytes::from("Hello from lance-io ObjectStore!"); + + eprintln!( + "[DIAG] Writing test_file.txt ({} bytes)...", + test_data.len() + ); + match object_store + .inner + .put(&test_path, test_data.clone().into()) + .await + { + Ok(_) => eprintln!("[DIAG] Write succeeded! ✅"), + Err(e) => { + eprintln!("[DIAG] Write FAILED: {:?}", e); + eprintln!("[DIAG] Error source: {:?}", std::error::Error::source(&e)); + return; + } + } + + eprintln!("[DIAG] Reading test_file.txt..."); + match object_store.inner.get(&test_path).await { + Ok(result) => { + let bytes = result.bytes().await.unwrap(); + let content = String::from_utf8_lossy(&bytes); + eprintln!("[DIAG] Read content: '{}' ({} bytes)", content, bytes.len()); + assert_eq!(bytes, test_data); + } + Err(e) => eprintln!("[DIAG] Read FAILED: {:?}", e), + } + + // Test 2: PutMode::Create (if_not_exists) + eprintln!("[DIAG] Writing with PutMode::Create (if_not_exists)..."); + match object_store + .inner + .put_opts( + &object_store::path::Path::parse("test_create.txt").unwrap(), + bytes::Bytes::from("conditional write!").into(), + object_store::PutOptions { + mode: object_store::PutMode::Create, + ..Default::default() + }, + ) + .await + { + Ok(_) => eprintln!("[DIAG] PutMode::Create succeeded! 
✅"), + Err(e) => { + eprintln!("[DIAG] PutMode::Create FAILED: {:?}", e); + } + } + + // Test 3: rename_if_not_exists + eprintln!("[DIAG] Testing rename_if_not_exists..."); + let tmp_path = object_store::path::Path::parse("_tmp_rename.txt").unwrap(); + let dest_path = object_store::path::Path::parse("renamed.txt").unwrap(); + match object_store + .inner + .put(&tmp_path, bytes::Bytes::from("rename me!").into()) + .await + { + Ok(_) => { + eprintln!("[DIAG] Tmp file written ✅"); + match object_store + .inner + .rename_if_not_exists(&tmp_path, &dest_path) + .await + { + Ok(_) => eprintln!("[DIAG] rename_if_not_exists succeeded! ✅"), + Err(e) => eprintln!("[DIAG] rename_if_not_exists FAILED: {:?}", e), + } + } + Err(e) => eprintln!("[DIAG] Tmp file write FAILED: {:?}", e), + } + + eprintln!("[DIAG] lance-io direct write test complete ✅"); +} diff --git a/rust/lance-namespace-impls/Cargo.toml b/rust/lance-namespace-impls/Cargo.toml index 963edf5e8ca..53ff79fb333 100644 --- a/rust/lance-namespace-impls/Cargo.toml +++ b/rust/lance-namespace-impls/Cargo.toml @@ -21,6 +21,7 @@ dir-aws = ["lance-io/aws", "lance/aws"] dir-azure = ["lance-io/azure", "lance/azure"] dir-oss = ["lance-io/oss", "lance/oss"] dir-huggingface = ["lance-io/huggingface", "lance/huggingface"] +dir-goosefs = ["lance-io/goosefs", "lance/goosefs"] # Credential vending features credential-vendor-aws = ["dep:aws-sdk-sts", "dep:aws-config", "dep:sha2", "dep:base64"] credential-vendor-gcp = ["dep:reqwest", "dep:serde", "dep:sha2", "dep:base64", "dep:ring", "dep:rustls-pki-types"] @@ -84,13 +85,17 @@ hmac = { version = "0.12", optional = true } quick-xml = { version = "0.38", optional = true } [dev-dependencies] +opendal = { workspace = true, features = ["services-goosefs"] } tokio = { workspace = true, features = ["full"] } tempfile.workspace = true wiremock.workspace = true arrow = { workspace = true } +arrow-array = { workspace = true } arrow-ipc = { workspace = true } rstest.workspace = true 
lance-table.workspace = true +lance-arrow = { workspace = true } +lance = { workspace = true } [lints] workspace = true diff --git a/rust/lance/Cargo.toml b/rust/lance/Cargo.toml index 95cc11e94a3..e1d9182b24d 100644 --- a/rust/lance/Cargo.toml +++ b/rust/lance/Cargo.toml @@ -104,6 +104,7 @@ prost-build.workspace = true protobuf-src = { version = "2.1", optional = true } [target.'cfg(target_os = "linux")'.dev-dependencies] +pprof.workspace = true # Need this so we can prevent dynamic linking in binaries (see cli feature) lzma-sys = { version = "0.1" } @@ -138,7 +139,7 @@ parquet = { version = "58", default-features = false, features = ["arrow", "asyn reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "json"] } [features] -default = ["aws", "azure", "gcp", "oss", "huggingface", "tencent", "tos", "geo"] +default = ["aws", "azure", "gcp", "oss", "huggingface", "tencent", "tos", "goosefs", "geo"] fp16kernels = ["lance-linalg/fp16kernels"] # Prevent dynamic linking of lzma, which comes from datafusion cli = ["dep:clap", "lzma-sys/static"] @@ -157,8 +158,9 @@ gcp = ["lance-io/gcp"] azure = ["lance-io/azure"] oss = ["lance-io/oss"] tencent = ["lance-io/tencent"] -huggingface = ["lance-io/huggingface"] +goosefs = ["lance-io/goosefs"] tos = ["lance-io/tos"] +huggingface = ["lance-io/huggingface"] geo = ["lance-datafusion/geo", "lance-index/geo"] # Enable slow integration tests (disabled by default in CI) slow_tests = [] From 6b6c4866e5dda7c63742b33faabff18a99be3b99 Mon Sep 17 00:00:00 2001 From: Lance Release Bot Date: Fri, 5 Jun 2026 11:50:49 +0000 Subject: [PATCH 036/177] chore: release beta version 8.0.0-beta.4 --- .bumpversion.toml | 2 +- Cargo.lock | 46 +++++++++++++++++++-------------------- Cargo.toml | 42 +++++++++++++++++------------------ java/lance-jni/Cargo.lock | 38 ++++++++++++++++---------------- java/lance-jni/Cargo.toml | 2 +- java/pom.xml | 2 +- python/Cargo.lock | 38 ++++++++++++++++---------------- python/Cargo.toml | 2 +- 
8 files changed, 86 insertions(+), 86 deletions(-) diff --git a/.bumpversion.toml b/.bumpversion.toml index 3c10a0473a8..e474bb77fea 100644 --- a/.bumpversion.toml +++ b/.bumpversion.toml @@ -1,5 +1,5 @@ [tool.bumpversion] -current_version = "8.0.0-beta.3" +current_version = "8.0.0-beta.4" parse = "(?P\\d+)\\.(?P\\d+)\\.(?P\\d+)(-(?P(beta|rc))\\.(?P\\d+))?" serialize = [ "{major}.{minor}.{patch}-{prerelease}.{prerelease_num}", diff --git a/Cargo.lock b/Cargo.lock index ff70a95ae1a..28c88cd2577 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3166,7 +3166,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4479,7 +4479,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "all_asserts", "approx", @@ -4583,7 +4583,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow-array", "arrow-buffer", @@ -4631,7 +4631,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrayref", "paste", @@ -4640,7 +4640,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow-array", "arrow-buffer", @@ -4677,7 +4677,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow", "arrow-array", @@ -4710,7 +4710,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow", "arrow-array", @@ -4730,7 +4730,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow-arith", 
"arrow-array", @@ -4775,7 +4775,7 @@ dependencies = [ [[package]] name = "lance-examples" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "all_asserts", "arrow", @@ -4801,7 +4801,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow-arith", "arrow-array", @@ -4841,7 +4841,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "datafusion", "geo-traits", @@ -4855,7 +4855,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "approx", "arc-swap", @@ -4934,7 +4934,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow", "arrow-arith", @@ -4983,7 +4983,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "approx", "arrow-array", @@ -5003,7 +5003,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow", "async-trait", @@ -5015,7 +5015,7 @@ dependencies = [ [[package]] name = "lance-namespace-datafusion" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow-array", "arrow-schema", @@ -5031,7 +5031,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow", "arrow-array", @@ -5090,7 +5090,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow-array", "arrow-buffer", @@ -5109,7 +5109,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow", "arrow-array", @@ -5156,7 +5156,7 @@ dependencies = [ [[package]] name = "lance-test-macros" -version = "8.0.0-beta.3" +version = 
"8.0.0-beta.4" dependencies = [ "proc-macro2", "quote", @@ -5165,7 +5165,7 @@ dependencies = [ [[package]] name = "lance-testing" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow-array", "arrow-schema", @@ -5178,7 +5178,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "icu_segmenter", "jieba-rs", @@ -5190,7 +5190,7 @@ dependencies = [ [[package]] name = "lance-tools" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "clap", "lance-core", diff --git a/Cargo.toml b/Cargo.toml index 3140866590b..e28823329a4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,7 +31,7 @@ resolver = "3" [workspace.package] -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" @@ -56,26 +56,26 @@ rust-version = "1.91.0" [workspace.dependencies] arc-swap = "1.7" libc = "0.2.176" -lance = { version = "=8.0.0-beta.3", path = "./rust/lance", default-features = false } -lance-arrow = { version = "=8.0.0-beta.3", path = "./rust/lance-arrow" } -lance-core = { version = "=8.0.0-beta.3", path = "./rust/lance-core" } -lance-datafusion = { version = "=8.0.0-beta.3", path = "./rust/lance-datafusion" } -lance-datagen = { version = "=8.0.0-beta.3", path = "./rust/lance-datagen" } -lance-encoding = { version = "=8.0.0-beta.3", path = "./rust/lance-encoding" } -lance-file = { version = "=8.0.0-beta.3", path = "./rust/lance-file" } -lance-geo = { version = "=8.0.0-beta.3", path = "./rust/lance-geo" } -lance-index = { version = "=8.0.0-beta.3", path = "./rust/lance-index" } -lance-io = { version = "=8.0.0-beta.3", path = "./rust/lance-io", default-features = false } -lance-linalg = { version = "=8.0.0-beta.3", path = "./rust/lance-linalg" } -lance-namespace = { version = "=8.0.0-beta.3", path = "./rust/lance-namespace" } -lance-namespace-impls = { version = "=8.0.0-beta.3", path = "./rust/lance-namespace-impls" } +lance = { 
version = "=8.0.0-beta.4", path = "./rust/lance", default-features = false } +lance-arrow = { version = "=8.0.0-beta.4", path = "./rust/lance-arrow" } +lance-core = { version = "=8.0.0-beta.4", path = "./rust/lance-core" } +lance-datafusion = { version = "=8.0.0-beta.4", path = "./rust/lance-datafusion" } +lance-datagen = { version = "=8.0.0-beta.4", path = "./rust/lance-datagen" } +lance-encoding = { version = "=8.0.0-beta.4", path = "./rust/lance-encoding" } +lance-file = { version = "=8.0.0-beta.4", path = "./rust/lance-file" } +lance-geo = { version = "=8.0.0-beta.4", path = "./rust/lance-geo" } +lance-index = { version = "=8.0.0-beta.4", path = "./rust/lance-index" } +lance-io = { version = "=8.0.0-beta.4", path = "./rust/lance-io", default-features = false } +lance-linalg = { version = "=8.0.0-beta.4", path = "./rust/lance-linalg" } +lance-namespace = { version = "=8.0.0-beta.4", path = "./rust/lance-namespace" } +lance-namespace-impls = { version = "=8.0.0-beta.4", path = "./rust/lance-namespace-impls" } lance-namespace-datafusion = { version = "=7.0.0-beta.9", path = "./rust/lance-namespace-datafusion" } lance-namespace-reqwest-client = "0.8.2" -lance-select = { version = "=8.0.0-beta.3", path = "./rust/lance-select" } -lance-tokenizer = { version = "=8.0.0-beta.3", path = "./rust/lance-tokenizer" } -lance-table = { version = "=8.0.0-beta.3", path = "./rust/lance-table" } -lance-test-macros = { version = "=8.0.0-beta.3", path = "./rust/lance-test-macros" } -lance-testing = { version = "=8.0.0-beta.3", path = "./rust/lance-testing" } +lance-select = { version = "=8.0.0-beta.4", path = "./rust/lance-select" } +lance-tokenizer = { version = "=8.0.0-beta.4", path = "./rust/lance-tokenizer" } +lance-table = { version = "=8.0.0-beta.4", path = "./rust/lance-table" } +lance-test-macros = { version = "=8.0.0-beta.4", path = "./rust/lance-test-macros" } +lance-testing = { version = "=8.0.0-beta.4", path = "./rust/lance-testing" } approx = "0.5.1" # Note that this 
one does not include pyarrow arrow = { version = "58.0.0", optional = false, features = ["prettyprint"] } @@ -102,7 +102,7 @@ half = { "version" = "2.1", default-features = false, features = [ "num-traits", "std", ] } -lance-bitpacking = { version = "=8.0.0-beta.3", path = "./rust/compression/bitpacking" } +lance-bitpacking = { version = "=8.0.0-beta.4", path = "./rust/compression/bitpacking" } bitpacking = "0.9" bitvec = "1" bytes = "1.11.1" @@ -142,7 +142,7 @@ deepsize = "0.2.0" dirs = "6.0.0" either = "1.0" fst = { version = "0.4.7", features = ["levenshtein"] } -fsst = { version = "=8.0.0-beta.3", path = "./rust/compression/fsst" } +fsst = { version = "=8.0.0-beta.4", path = "./rust/compression/fsst" } futures = "0.3" geoarrow-array = "0.8" geoarrow-schema = "0.8" diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index 8dde29c552a..bb1554398c7 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -2569,7 +2569,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow-array", "rand 0.9.4", @@ -3770,7 +3770,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arc-swap", "arrow", @@ -3844,7 +3844,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow-array", "arrow-buffer", @@ -3886,7 +3886,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrayref", "paste", @@ -3895,7 +3895,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow-array", "arrow-buffer", @@ -3930,7 +3930,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = 
"8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow", "arrow-array", @@ -3962,7 +3962,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow", "arrow-array", @@ -3980,7 +3980,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow-arith", "arrow-array", @@ -4015,7 +4015,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow-arith", "arrow-array", @@ -4046,7 +4046,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "datafusion", "geo-traits", @@ -4060,7 +4060,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arc-swap", "arrow", @@ -4130,7 +4130,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow", "arrow-arith", @@ -4172,7 +4172,7 @@ dependencies = [ [[package]] name = "lance-jni" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow", "arrow-array", @@ -4208,7 +4208,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow-array", "arrow-buffer", @@ -4224,7 +4224,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow", "async-trait", @@ -4236,7 +4236,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow", "arrow-ipc", @@ -4280,7 +4280,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow-array", "arrow-buffer", @@ -4296,7 +4296,7 @@ dependencies = [ [[package]] name = 
"lance-table" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow", "arrow-array", @@ -4334,7 +4334,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "icu_segmenter", "rust-stemmers", diff --git a/java/lance-jni/Cargo.toml b/java/lance-jni/Cargo.toml index e8bf842dbdc..3ce5102326d 100644 --- a/java/lance-jni/Cargo.toml +++ b/java/lance-jni/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lance-jni" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" edition = "2024" authors = ["Lance Devs "] rust-version = "1.91" diff --git a/java/pom.xml b/java/pom.xml index 68390ec4128..0a55a78f47d 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -7,7 +7,7 @@ org.lance lance-core Lance Core - 8.0.0-beta.3 + 8.0.0-beta.4 jar Lance Format Java API diff --git a/python/Cargo.lock b/python/Cargo.lock index 908c2a21423..5f71bae7fe0 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -2919,7 +2919,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4136,7 +4136,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arc-swap", "arrow", @@ -4211,7 +4211,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow-array", "arrow-buffer", @@ -4253,7 +4253,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrayref", "paste", @@ -4262,7 +4262,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow-array", "arrow-buffer", @@ -4297,7 +4297,7 @@ dependencies = [ [[package]] name = 
"lance-datafusion" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow", "arrow-array", @@ -4329,7 +4329,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow", "arrow-array", @@ -4347,7 +4347,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow-arith", "arrow-array", @@ -4382,7 +4382,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow-arith", "arrow-array", @@ -4413,7 +4413,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "datafusion", "geo-traits", @@ -4427,7 +4427,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arc-swap", "arrow", @@ -4498,7 +4498,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow", "arrow-arith", @@ -4540,7 +4540,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow-array", "arrow-buffer", @@ -4556,7 +4556,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow", "async-trait", @@ -4568,7 +4568,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow", "arrow-ipc", @@ -4612,7 +4612,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow-array", "arrow-buffer", @@ -4628,7 +4628,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow", "arrow-array", @@ -4668,7 +4668,7 @@ 
dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "icu_segmenter", "jieba-rs", @@ -6156,7 +6156,7 @@ dependencies = [ [[package]] name = "pylance" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" dependencies = [ "arrow", "arrow-array", diff --git a/python/Cargo.toml b/python/Cargo.toml index e96e8862329..79b603bffd3 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pylance" -version = "8.0.0-beta.3" +version = "8.0.0-beta.4" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" From cb9a782882da19d1f0c41b5fb6b80c680442df5f Mon Sep 17 00:00:00 2001 From: Dan Rammer Date: Fri, 5 Jun 2026 10:39:54 -0500 Subject: [PATCH 037/177] fix(mem-wal): surface fenced flush to durability waiters instead of hanging (#7132) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem A `durable_write` put waits on a `BatchDurableWatcher`, which only resolves when the durability watermark reaches its target batch position. The watermark is advanced **exclusively on a successful** WAL flush (`flush_from_batch_store`). When a flush fails terminally — a successor has claimed a higher epoch and fenced this writer — the append returns `Err` before the watermark advances, the background dispatcher logs and swallows the error (by design, so a transient flush failure doesn't deadlock puts), and the watch channel is never closed. The watcher's `wait()` therefore **loops forever**: the watermark can never reach the target and the channel never closes. The put hangs until the client times out. This became reachable with the fence-on-claim sentinel (#7110): a fenced predecessor's next flush now collides with the sentinel and fails, where it previously false-acked into an empty slot and advanced the watermark. 
Surfaced downstream as a WAL server's `merge_insert` HTTP handler hanging (client `operation timed out`) after a rolling-restart pod replacement, instead of returning a terminal verdict. ## Fix Share a terminal-error slot between `WalFlusher` and every `BatchDurableWatcher` it hands out. On a fence error, `flush` records the error and wakes all waiters (via `send_modify`, without advancing the watermark); `wait()` re-checks the slot and returns the fence error. A fence is permanent, so failing fast is correct — and the error text is preserved so callers can map it to a terminal verdict (e.g. HTTP 410). Only fences poison the slot; transient flush failures keep the existing log-and-retry-on-next-flush behavior. ## Test `test_durable_watcher_aborts_on_fence_instead_of_hanging`: a predecessor's durable flush collides with a successor's fence sentinel; the watcher resolves with the fence error under a timeout rather than blocking forever. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-authored-by: Claude Opus 4.8 --- rust/lance/src/dataset/mem_wal/wal.rs | 122 +++++++++++++++++++++++++- 1 file changed, 118 insertions(+), 4 deletions(-) diff --git a/rust/lance/src/dataset/mem_wal/wal.rs b/rust/lance/src/dataset/mem_wal/wal.rs index 7b95d94305a..afccfbc0979 100644 --- a/rust/lance/src/dataset/mem_wal/wal.rs +++ b/rust/lance/src/dataset/mem_wal/wal.rs @@ -43,6 +43,13 @@ pub const WRITER_EPOCH_KEY: &str = "writer_epoch"; /// replay skips sentinels via their empty batch list). pub const FENCE_SENTINEL_KEY: &str = "fence_sentinel"; +/// True if `error` is the terminal fence emitted by `ManifestStore::check_fenced` +/// (a successor claimed a higher epoch). Matches the message it formats, since +/// fences surface as a plain `Error::io` rather than a typed variant. +fn is_fence_error(error: &Error) -> bool { + error.to_string().contains("Writer fenced") +} + /// Watcher for batch durability using watermark-based tracking. 
/// /// Uses a shared watch channel that broadcasts the durable watermark. @@ -53,22 +60,36 @@ pub struct BatchDurableWatcher { rx: watch::Receiver, /// Target batch ID to wait for. target_batch_position: usize, + /// Terminal flush failure (e.g. a fence) shared with the flusher. When + /// set, the watermark will never advance to the target, so `wait` + /// returns this error instead of blocking forever. + terminal_error: Arc>>, } impl BatchDurableWatcher { /// Create a new watcher for a specific batch ID. - pub fn new(rx: watch::Receiver, target_batch_position: usize) -> Self { + pub fn new( + rx: watch::Receiver, + target_batch_position: usize, + terminal_error: Arc>>, + ) -> Self { Self { rx, target_batch_position, + terminal_error, } } /// Wait until the batch is durable. /// - /// Returns Ok(()) when `durable_watermark >= target_batch_position`. + /// Returns Ok(()) when `durable_watermark >= target_batch_position`, or + /// Err if a terminal flush failure (e.g. a fence) means the watermark can + /// never reach the target. pub async fn wait(&mut self) -> Result<()> { loop { + if let Some(msg) = self.terminal_error.lock().unwrap().clone() { + return Err(Error::io(msg)); + } let current = *self.rx.borrow(); if current >= self.target_batch_position { return Ok(()); @@ -317,6 +338,11 @@ pub struct WalFlusher { /// Created at construction and recreated after each flush. /// Used by backpressure to wait for WAL flushes. wal_flush_cell: std::sync::Mutex>>, + /// First terminal flush failure (a fence). Shared with every + /// `BatchDurableWatcher` so a fenced flush — which never advances the + /// watermark — wakes durability waiters with the error instead of + /// hanging them forever. 
+ terminal_error: Arc>>, } impl WalFlusher { @@ -338,6 +364,7 @@ impl WalFlusher { shard_id, flush_tx: None, wal_flush_cell: std::sync::Mutex::new(Some(wal_flush_cell)), + terminal_error: Arc::new(StdMutex::new(None)), } } @@ -358,7 +385,27 @@ impl WalFlusher { pub fn track_batch(&self, batch_position: usize) -> BatchDurableWatcher { // Return a watcher that waits for this batch to become durable // batch_position is 0-indexed, so we wait for watermark > batch_position (i.e., >= batch_position + 1) - BatchDurableWatcher::new(self.durable_watermark_rx.clone(), batch_position + 1) + BatchDurableWatcher::new( + self.durable_watermark_rx.clone(), + batch_position + 1, + Arc::clone(&self.terminal_error), + ) + } + + /// Record a terminal flush failure (a fence) and wake every pending + /// durability waiter. A fence is permanent — the watermark will never + /// advance — so waiters must observe the error rather than block forever. + /// Idempotent: only the first failure is retained. + fn mark_terminal_failure(&self, error: &Error) { + { + let mut slot = self.terminal_error.lock().unwrap(); + if slot.is_none() { + *slot = Some(error.to_string()); + } + } + // Wake `wait`ers without advancing the watermark; each re-checks + // `terminal_error` and returns the error. + self.durable_watermark_tx.send_modify(|_| {}); } /// Get the current durable watermark. @@ -431,7 +478,7 @@ impl WalFlusher { source: &WalFlushSource, end_batch_position: usize, ) -> Result { - match source { + let result = match source { WalFlushSource::BatchStore { batch_store, indexes, @@ -440,7 +487,16 @@ impl WalFlusher { .await } WalFlushSource::WalOnly { state } => self.flush_from_wal_only(state).await, + }; + // A fence is terminal: the append will never succeed, so the + // durability watermark can never advance. Wake any waiter (e.g. a + // `durable_write` put) with the fence error instead of hanging it. 
+ if let Err(e) = &result + && is_fence_error(e) + { + self.mark_terminal_failure(e); } + result } async fn flush_from_batch_store( @@ -1700,6 +1756,64 @@ mod tests { assert_eq!(res.entry_position, 3); } + // Regression: a fenced WAL flush never advances the durability watermark. + // A `durable_write` put waits on a `BatchDurableWatcher`, so without + // terminal-failure propagation the watcher blocks forever (the predecessor + // pod's HTTP write hangs until the client times out). The flusher must + // surface the fence through the watcher so the caller fails fast with 410. + #[tokio::test] + async fn test_durable_watcher_aborts_on_fence_instead_of_hanging() { + let (store, base_path, _temp_dir) = create_local_store().await; + let shard_id = Uuid::new_v4(); + let schema = create_test_schema(); + + // Predecessor claims epoch 1 and writes one entry (position 1), seeding + // its cached next position at 2. The flusher shares this appender. + let first = Arc::new( + WalAppender::open(store.clone(), base_path.clone(), shard_id, 0) + .await + .unwrap(), + ); + assert_eq!(first.writer_epoch(), 1); + first + .append(vec![create_test_batch(&schema, 1)]) + .await + .unwrap(); + let flusher = WalFlusher::new(Arc::clone(&first)); + + // Successor claims epoch 2 and drops a sentinel at the predecessor's + // next slot (position 2) — a rolling-restart pod replacement. + let second = WalAppender::open(store.clone(), base_path.clone(), shard_id, 0) + .await + .unwrap(); + assert_eq!(second.writer_epoch(), 2); + assert_eq!(second.write_fence_sentinel().await.unwrap(), 2); + + // A durable put on the predecessor: stage a batch and track it. + let batch_store = Arc::new(BatchStore::with_capacity(10)); + batch_store.append(create_test_batch(&schema, 1)).unwrap(); + let mut watcher = flusher.track_batch(0); + + // Flushing collides with the sentinel and fences. 
Both the flush result + // and the watcher must report the fence — and the watcher must resolve + // promptly, not block on a watermark that can never advance. + let source = batch_store_source(&batch_store); + let flush_err = flusher.flush(&source, batch_store.len()).await.unwrap_err(); + assert!( + is_fence_error(&flush_err), + "expected fence error from flush, got: {flush_err}" + ); + + let waited = tokio::time::timeout(std::time::Duration::from_secs(5), watcher.wait()).await; + let err = waited + .expect("watcher.wait() hung after a fenced flush") + .expect_err("watcher must surface the fence, not report success"); + assert!( + is_fence_error(&err), + "watcher must report the fence so the HTTP layer maps 410, got: {err}" + ); + } + #[tokio::test] async fn test_wal_appender_rejects_invalid_input() { let (store, base_path, _temp_dir) = create_local_store().await; From 9b46abeef7fa2da43916a2b0dbcb0c3ffbffa061 Mon Sep 17 00:00:00 2001 From: Lance Release Bot Date: Fri, 5 Jun 2026 15:43:34 +0000 Subject: [PATCH 038/177] chore: release beta version 8.0.0-beta.5 --- .bumpversion.toml | 2 +- Cargo.lock | 46 +++++++++++++++++++-------------------- Cargo.toml | 42 +++++++++++++++++------------------ java/lance-jni/Cargo.lock | 38 ++++++++++++++++---------------- java/lance-jni/Cargo.toml | 2 +- java/pom.xml | 2 +- python/Cargo.lock | 38 ++++++++++++++++---------------- python/Cargo.toml | 2 +- 8 files changed, 86 insertions(+), 86 deletions(-) diff --git a/.bumpversion.toml b/.bumpversion.toml index e474bb77fea..5b36554af95 100644 --- a/.bumpversion.toml +++ b/.bumpversion.toml @@ -1,5 +1,5 @@ [tool.bumpversion] -current_version = "8.0.0-beta.4" +current_version = "8.0.0-beta.5" parse = "(?P\\d+)\\.(?P\\d+)\\.(?P\\d+)(-(?P(beta|rc))\\.(?P\\d+))?" 
serialize = [ "{major}.{minor}.{patch}-{prerelease}.{prerelease_num}", diff --git a/Cargo.lock b/Cargo.lock index 28c88cd2577..03934fe744f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3166,7 +3166,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4479,7 +4479,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "all_asserts", "approx", @@ -4583,7 +4583,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow-array", "arrow-buffer", @@ -4631,7 +4631,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrayref", "paste", @@ -4640,7 +4640,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow-array", "arrow-buffer", @@ -4677,7 +4677,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow", "arrow-array", @@ -4710,7 +4710,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow", "arrow-array", @@ -4730,7 +4730,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow-arith", "arrow-array", @@ -4775,7 +4775,7 @@ dependencies = [ [[package]] name = "lance-examples" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "all_asserts", "arrow", @@ -4801,7 +4801,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow-arith", "arrow-array", @@ -4841,7 
+4841,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "datafusion", "geo-traits", @@ -4855,7 +4855,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "approx", "arc-swap", @@ -4934,7 +4934,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow", "arrow-arith", @@ -4983,7 +4983,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "approx", "arrow-array", @@ -5003,7 +5003,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow", "async-trait", @@ -5015,7 +5015,7 @@ dependencies = [ [[package]] name = "lance-namespace-datafusion" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow-array", "arrow-schema", @@ -5031,7 +5031,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow", "arrow-array", @@ -5090,7 +5090,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow-array", "arrow-buffer", @@ -5109,7 +5109,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow", "arrow-array", @@ -5156,7 +5156,7 @@ dependencies = [ [[package]] name = "lance-test-macros" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "proc-macro2", "quote", @@ -5165,7 +5165,7 @@ dependencies = [ [[package]] name = "lance-testing" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow-array", "arrow-schema", @@ -5178,7 +5178,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" 
dependencies = [ "icu_segmenter", "jieba-rs", @@ -5190,7 +5190,7 @@ dependencies = [ [[package]] name = "lance-tools" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "clap", "lance-core", diff --git a/Cargo.toml b/Cargo.toml index e28823329a4..5b94e2adfa3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,7 +31,7 @@ resolver = "3" [workspace.package] -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" @@ -56,26 +56,26 @@ rust-version = "1.91.0" [workspace.dependencies] arc-swap = "1.7" libc = "0.2.176" -lance = { version = "=8.0.0-beta.4", path = "./rust/lance", default-features = false } -lance-arrow = { version = "=8.0.0-beta.4", path = "./rust/lance-arrow" } -lance-core = { version = "=8.0.0-beta.4", path = "./rust/lance-core" } -lance-datafusion = { version = "=8.0.0-beta.4", path = "./rust/lance-datafusion" } -lance-datagen = { version = "=8.0.0-beta.4", path = "./rust/lance-datagen" } -lance-encoding = { version = "=8.0.0-beta.4", path = "./rust/lance-encoding" } -lance-file = { version = "=8.0.0-beta.4", path = "./rust/lance-file" } -lance-geo = { version = "=8.0.0-beta.4", path = "./rust/lance-geo" } -lance-index = { version = "=8.0.0-beta.4", path = "./rust/lance-index" } -lance-io = { version = "=8.0.0-beta.4", path = "./rust/lance-io", default-features = false } -lance-linalg = { version = "=8.0.0-beta.4", path = "./rust/lance-linalg" } -lance-namespace = { version = "=8.0.0-beta.4", path = "./rust/lance-namespace" } -lance-namespace-impls = { version = "=8.0.0-beta.4", path = "./rust/lance-namespace-impls" } +lance = { version = "=8.0.0-beta.5", path = "./rust/lance", default-features = false } +lance-arrow = { version = "=8.0.0-beta.5", path = "./rust/lance-arrow" } +lance-core = { version = "=8.0.0-beta.5", path = "./rust/lance-core" } +lance-datafusion = { version = "=8.0.0-beta.5", path = "./rust/lance-datafusion" } +lance-datagen = { version = "=8.0.0-beta.5", path 
= "./rust/lance-datagen" } +lance-encoding = { version = "=8.0.0-beta.5", path = "./rust/lance-encoding" } +lance-file = { version = "=8.0.0-beta.5", path = "./rust/lance-file" } +lance-geo = { version = "=8.0.0-beta.5", path = "./rust/lance-geo" } +lance-index = { version = "=8.0.0-beta.5", path = "./rust/lance-index" } +lance-io = { version = "=8.0.0-beta.5", path = "./rust/lance-io", default-features = false } +lance-linalg = { version = "=8.0.0-beta.5", path = "./rust/lance-linalg" } +lance-namespace = { version = "=8.0.0-beta.5", path = "./rust/lance-namespace" } +lance-namespace-impls = { version = "=8.0.0-beta.5", path = "./rust/lance-namespace-impls" } lance-namespace-datafusion = { version = "=7.0.0-beta.9", path = "./rust/lance-namespace-datafusion" } lance-namespace-reqwest-client = "0.8.2" -lance-select = { version = "=8.0.0-beta.4", path = "./rust/lance-select" } -lance-tokenizer = { version = "=8.0.0-beta.4", path = "./rust/lance-tokenizer" } -lance-table = { version = "=8.0.0-beta.4", path = "./rust/lance-table" } -lance-test-macros = { version = "=8.0.0-beta.4", path = "./rust/lance-test-macros" } -lance-testing = { version = "=8.0.0-beta.4", path = "./rust/lance-testing" } +lance-select = { version = "=8.0.0-beta.5", path = "./rust/lance-select" } +lance-tokenizer = { version = "=8.0.0-beta.5", path = "./rust/lance-tokenizer" } +lance-table = { version = "=8.0.0-beta.5", path = "./rust/lance-table" } +lance-test-macros = { version = "=8.0.0-beta.5", path = "./rust/lance-test-macros" } +lance-testing = { version = "=8.0.0-beta.5", path = "./rust/lance-testing" } approx = "0.5.1" # Note that this one does not include pyarrow arrow = { version = "58.0.0", optional = false, features = ["prettyprint"] } @@ -102,7 +102,7 @@ half = { "version" = "2.1", default-features = false, features = [ "num-traits", "std", ] } -lance-bitpacking = { version = "=8.0.0-beta.4", path = "./rust/compression/bitpacking" } +lance-bitpacking = { version = "=8.0.0-beta.5", 
path = "./rust/compression/bitpacking" } bitpacking = "0.9" bitvec = "1" bytes = "1.11.1" @@ -142,7 +142,7 @@ deepsize = "0.2.0" dirs = "6.0.0" either = "1.0" fst = { version = "0.4.7", features = ["levenshtein"] } -fsst = { version = "=8.0.0-beta.4", path = "./rust/compression/fsst" } +fsst = { version = "=8.0.0-beta.5", path = "./rust/compression/fsst" } futures = "0.3" geoarrow-array = "0.8" geoarrow-schema = "0.8" diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index bb1554398c7..1e5a23e153e 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -2569,7 +2569,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow-array", "rand 0.9.4", @@ -3770,7 +3770,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arc-swap", "arrow", @@ -3844,7 +3844,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow-array", "arrow-buffer", @@ -3886,7 +3886,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrayref", "paste", @@ -3895,7 +3895,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow-array", "arrow-buffer", @@ -3930,7 +3930,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow", "arrow-array", @@ -3962,7 +3962,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow", "arrow-array", @@ -3980,7 +3980,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.4" +version = 
"8.0.0-beta.5" dependencies = [ "arrow-arith", "arrow-array", @@ -4015,7 +4015,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow-arith", "arrow-array", @@ -4046,7 +4046,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "datafusion", "geo-traits", @@ -4060,7 +4060,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arc-swap", "arrow", @@ -4130,7 +4130,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow", "arrow-arith", @@ -4172,7 +4172,7 @@ dependencies = [ [[package]] name = "lance-jni" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow", "arrow-array", @@ -4208,7 +4208,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow-array", "arrow-buffer", @@ -4224,7 +4224,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow", "async-trait", @@ -4236,7 +4236,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow", "arrow-ipc", @@ -4280,7 +4280,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow-array", "arrow-buffer", @@ -4296,7 +4296,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow", "arrow-array", @@ -4334,7 +4334,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "icu_segmenter", "rust-stemmers", diff --git a/java/lance-jni/Cargo.toml b/java/lance-jni/Cargo.toml index 
3ce5102326d..723a213bac2 100644 --- a/java/lance-jni/Cargo.toml +++ b/java/lance-jni/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lance-jni" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" edition = "2024" authors = ["Lance Devs "] rust-version = "1.91" diff --git a/java/pom.xml b/java/pom.xml index 0a55a78f47d..fbedf563249 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -7,7 +7,7 @@ org.lance lance-core Lance Core - 8.0.0-beta.4 + 8.0.0-beta.5 jar Lance Format Java API diff --git a/python/Cargo.lock b/python/Cargo.lock index 5f71bae7fe0..86fed6f38c6 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -2919,7 +2919,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4136,7 +4136,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arc-swap", "arrow", @@ -4211,7 +4211,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow-array", "arrow-buffer", @@ -4253,7 +4253,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrayref", "paste", @@ -4262,7 +4262,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow-array", "arrow-buffer", @@ -4297,7 +4297,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow", "arrow-array", @@ -4329,7 +4329,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow", "arrow-array", @@ -4347,7 +4347,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = 
"8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow-arith", "arrow-array", @@ -4382,7 +4382,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow-arith", "arrow-array", @@ -4413,7 +4413,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "datafusion", "geo-traits", @@ -4427,7 +4427,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arc-swap", "arrow", @@ -4498,7 +4498,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow", "arrow-arith", @@ -4540,7 +4540,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow-array", "arrow-buffer", @@ -4556,7 +4556,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow", "async-trait", @@ -4568,7 +4568,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow", "arrow-ipc", @@ -4612,7 +4612,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow-array", "arrow-buffer", @@ -4628,7 +4628,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow", "arrow-array", @@ -4668,7 +4668,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "icu_segmenter", "jieba-rs", @@ -6156,7 +6156,7 @@ dependencies = [ [[package]] name = "pylance" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" dependencies = [ "arrow", "arrow-array", diff --git a/python/Cargo.toml b/python/Cargo.toml index 
79b603bffd3..654872ee863 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pylance" -version = "8.0.0-beta.4" +version = "8.0.0-beta.5" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" From 4441d15951cfa73988caff3747f5146f468d2672 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Fri, 5 Jun 2026 09:10:47 -0700 Subject: [PATCH 039/177] fix(linalg): reduce cosine bench TOTAL to avoid FixedSizeBinaryArray i32 overflow (#7116) BFloat16Type arrays are backed by FixedSizeBinaryArray (2 bytes/element). With TOTAL=1M and DIMENSION=1024 the allocation reached 2*1024^3 bytes, exceeding Arrow's i32 offset limit of 2^31-1 and causing a panic. Fixes #7113 Co-authored-by: Claude Sonnet 4.6 --- rust/lance-linalg/benches/cosine.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/lance-linalg/benches/cosine.rs b/rust/lance-linalg/benches/cosine.rs index 223299a934c..49e816b76df 100644 --- a/rust/lance-linalg/benches/cosine.rs +++ b/rust/lance-linalg/benches/cosine.rs @@ -40,7 +40,7 @@ where T::Native: Cosine, { const DIMENSION: usize = 1024; - const TOTAL: usize = 1024 * 1024; // 1M vectors + const TOTAL: usize = 512 * 1024; let type_name = std::any::type_name::(); let key = generate_random_array_with_seed::(DIMENSION, [0; 32]); From df94ee64b03e94eb485d9cfcd0e662dd34645de8 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 5 Jun 2026 14:52:30 -0700 Subject: [PATCH 040/177] feat: expose tracked_files and all_files on LanceDataset (#6011) Adds new `tracked_files()` and `all_files()` methods that return data about files in a table. Both return as Arrow data. `tracked_files()` outputs a row for every file referenced by each version. Files that are referenced by multiple versions (such as a data file) have a row for each version. This has columns for `base_uri`, `version`, `path`, and `file_type`. 
`all_files()` outputs a row for every file in the dataset root directory, whether or not they are part of the table. This has columns for `base_uri`, `path`, `file_size`, `last_modified`. These two data streams can be used in combination to do deeper analysis on file structure of a table. It can answer questions like: How much of the storage space is taken up by untracked files? When were untracked files created? Which files are taking up the most space? How big is version X? --------- Co-authored-by: Claude --- python/python/lance/dataset.py | 59 + python/python/tests/test_dataset.py | 31 +- python/src/dataset.rs | 48 + rust/lance-table/src/utils.rs | 2 + rust/lance/src/dataset.rs | 1 + rust/lance/src/dataset/files.rs | 1169 ++++++++++++++++++++ rust/lance/src/dataset/files/arrow.rs | 128 +++ rust/lance/src/dataset/files/file_types.rs | 33 + 8 files changed, 1470 insertions(+), 1 deletion(-) create mode 100644 rust/lance/src/dataset/files.rs create mode 100644 rust/lance/src/dataset/files/arrow.rs create mode 100644 rust/lance/src/dataset/files/file_types.rs diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 931a09d6ce6..ccd44e6dcaf 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -5067,6 +5067,65 @@ def mem_wal_writer( raw = self._ds.mem_wal_writer(shard_id, **kwargs) return _mw.ShardWriter(raw) + def tracked_files( + self, + *, + min_version: Optional[int] = None, + progress: Optional[Callable] = None, + ) -> pa.RecordBatchReader: + """Stream all files referenced by any manifest version of this dataset. + + Parameters + ---------- + min_version : int, optional + If set, only include manifests with version >= min_version. + progress : callable, optional + Called after each manifest is processed with two arguments: + ``(manifests_processed: int, manifests_total: Optional[int])``. + ``manifests_total`` is ``None`` until all manifest locations + have been listed. 
Works well with ``tqdm``:: + + from tqdm import tqdm + pbar = tqdm(unit="manifest") + def on_progress(processed, total): + if total is not None: + pbar.total = total + pbar.update(1) + reader = ds.tracked_files(progress=on_progress) + table = reader.read_all() + pbar.close() + + Returns + ------- + pyarrow.RecordBatchReader + Schema: + + - **version** (int64): manifest version number + - **base_uri** (dictionary): storage root URI + - **path** (utf8): file path relative to ``base_uri`` + - **type** (dictionary): one of ``manifest``, + ``data file``, ``deletion file``, ``transaction file``, + ``index file`` + + Output order is non-deterministic. + """ + return self._ds.tracked_files(min_version=min_version, progress=progress) + + def all_files(self) -> pa.RecordBatchReader: + """Stream all files physically present at this dataset's base URI. + + Returns a :class:`pyarrow.RecordBatchReader` with schema: + + - **base_uri** (dictionary): storage root URI + - **path** (utf8): file path relative to ``base_uri`` + - **size_bytes** (int64): file size in bytes + - **last_modified** (timestamp[us, UTC]): last modification time + + Only the primary object store is scanned; alternate ``base_paths`` + entries are not included. 
+ """ + return self._ds.all_files() + class SqlQuery: """ diff --git a/python/python/tests/test_dataset.py b/python/python/tests/test_dataset.py index b5c81669fa1..4af363868e1 100644 --- a/python/python/tests/test_dataset.py +++ b/python/python/tests/test_dataset.py @@ -5657,4 +5657,33 @@ def test_default_scan_options_nearest(tmp_path: Path) -> None: distances = result["_distance"].to_pylist() assert distances == sorted(distances) - assert "id" in result.column_names + +def test_tracked_files(tmp_path): + table = pa.table({"x": [1, 2, 3]}) + ds = lance.write_dataset(table, tmp_path / "ds") + ds.delete("x = 2") # adds a deletion file + + reader = ds.tracked_files() + assert isinstance(reader, pa.RecordBatchReader) + + result = reader.read_all() + assert result.schema.field("version").type == pa.int64() + assert result.num_rows >= 2 # at least manifest + data file + + types = set(result.column("type").to_pylist()) + assert "manifest" in types + assert "data file" in types + assert "deletion file" in types + + +def test_all_files(tmp_path): + table = pa.table({"x": [1, 2, 3]}) + ds = lance.write_dataset(table, tmp_path / "ds") + + reader = ds.all_files() + assert isinstance(reader, pa.RecordBatchReader) + + result = reader.read_all() + assert result.schema.field("size_bytes").type == pa.int64() + assert result.num_rows >= 2 # at least manifest + data file + assert all(s > 0 for s in result.column("size_bytes").to_pylist()) diff --git a/python/src/dataset.rs b/python/src/dataset.rs index 8c16ca85399..f9f6ed669b1 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -2910,6 +2910,54 @@ impl Dataset { Ok(PyArrowType(reader)) } + #[pyo3(signature = (*, min_version=None, progress=None))] + fn tracked_files( + &self, + min_version: Option, + progress: Option<&Bound<'_, PyAny>>, + ) -> PyResult>> { + use lance::dataset::files::{TrackedFilesOptions, TrackedFilesProgress}; + + let progress_cb: Option> = + if let Some(cb) = progress { + if !cb.is_callable() { + 
return Err(PyValueError::new_err("progress must be callable")); + } + let cb = cb.clone().unbind(); + Some(Box::new(move |p: TrackedFilesProgress| { + Python::attach(|py| { + let total: Option = p.manifests_total; + match cb.call1(py, (p.manifests_processed, total)) { + Ok(_) => (), + Err(e) => { + log::error!("Error in tracked_files progress callback: {}", e); + } + } + }); + })) + } else { + None + }; + + let options = TrackedFilesOptions { + min_version, + progress: progress_cb, + }; + let stream = rt().block_on(None, self.ds.tracked_files_with_options(options))?; + let reader = Box::new(LanceReader::from_stream(DatasetRecordBatchStream::new( + stream, + ))); + Ok(PyArrowType(reader)) + } + + fn all_files(&self) -> PyResult>> { + let stream = rt().block_on(None, self.ds.all_files())?; + let reader = Box::new(LanceReader::from_stream(DatasetRecordBatchStream::new( + stream, + ))); + Ok(PyArrowType(reader)) + } + #[pyo3(signature = (keys))] fn delete_config_keys(&mut self, keys: Vec) -> PyResult<()> { let mut new_self = self.ds.as_ref().clone(); diff --git a/rust/lance-table/src/utils.rs b/rust/lance-table/src/utils.rs index 01c64f78710..0c37ef1e001 100644 --- a/rust/lance-table/src/utils.rs +++ b/rust/lance-table/src/utils.rs @@ -45,3 +45,5 @@ impl Iterator for ExactSize { (self.size, Some(self.size)) } } + +impl ExactSizeIterator for ExactSize {} diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index e3f7a197e58..2e448dfa828 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -73,6 +73,7 @@ mod branch_location; pub mod builder; pub mod cleanup; pub mod delta; +pub mod files; pub mod fragment; mod hash_joiner; pub mod index; diff --git a/rust/lance/src/dataset/files.rs b/rust/lance/src/dataset/files.rs new file mode 100644 index 00000000000..848add7e4a8 --- /dev/null +++ b/rust/lance/src/dataset/files.rs @@ -0,0 +1,1169 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! 
Dataset file inspection APIs. + +use std::borrow::Cow; +use std::collections::HashMap; +use std::sync::Arc; +use std::sync::atomic::{AtomicUsize, Ordering}; + +use arrow_array::RecordBatch; +use arrow_array::builder::{ + Int64Builder, StringBuilder, StringDictionaryBuilder, TimestampMicrosecondBuilder, +}; +use arrow_array::types::Int32Type; +use datafusion::execution::SendableRecordBatchStream; +use datafusion::physical_plan::stream::RecordBatchStreamAdapter; +use either::Either; +use futures::stream::FuturesUnordered; +use futures::{Future, StreamExt, TryStreamExt}; +use lance_table::format::IndexMetadata; +use lance_table::utils::LanceIteratorExtension; +use object_store::path::Path; +use uuid::Uuid; + +use crate::Dataset; +use crate::dataset::files::arrow::{TRACKED_FILES_SCHEMA, TrackedFileBatch}; +use crate::dataset::files::file_types::FileType; +use crate::dataset::{DATA_DIR, INDICES_DIR, TRANSACTIONS_DIR}; +use lance_core::Result; +use lance_table::io::deletion::relative_deletion_file_path; +use lance_table::io::manifest::{read_manifest, read_manifest_indexes}; + +mod arrow; +mod file_types; + +const BATCH_SIZE: usize = 4096; +/// Memory budget for in-flight manifests (estimated in-memory size). +const MANIFEST_MEMORY_BUDGET: usize = 1024 * 1024 * 1024; // 1 GB +/// Estimated ratio of in-memory size to on-disk size for manifests. Found +/// empirically; manifests are protobuf with significant decompression and +/// allocator overhead once parsed. +const MANIFEST_DECOMPRESSION_RATIO: usize = 4; + +fn remove_prefix(path: &Path, prefix: &Path) -> Path { + match path.prefix_match(prefix) { + Some(parts) => Path::from_iter(parts), + None => path.clone(), + } +} + +/// A single row destined for the `tracked_files` output. +struct FileRow<'a> { + version: u64, + base_uri: Cow<'a, str>, + path: Cow<'a, str>, + file_type: FileType, +} + +/// Resolve the base URI a file lives under. 
Files referenced from a shallow +/// clone carry a `base_id` pointing into `manifest.base_paths`; otherwise they +/// live under this dataset's own `base_uri`. +fn resolve_base_uri<'a>( + manifest: &'a lance_table::format::Manifest, + base_id: Option, + base_uri: &'a str, +) -> &'a str { + base_id + .and_then(|id| manifest.base_paths.get(&id).map(|bp| bp.path.as_str())) + .unwrap_or(base_uri) +} + +fn manifest_file_rows<'a>( + manifest: &'a lance_table::format::Manifest, + base_uri: &'a str, + manifest_path: &'a str, +) -> Box> + Send + 'a> { + let mut files = 1; + let manifest_row = FileRow { + version: manifest.version, + base_uri: Cow::Borrowed(base_uri), + path: Cow::Borrowed(manifest_path), + file_type: FileType::Manifest, + }; + let iter = std::iter::once(manifest_row); + + let iter = if let Some(txn_file) = &manifest.transaction_file { + files += 1; + let txn_row = FileRow { + version: manifest.version, + base_uri: Cow::Borrowed(base_uri), + path: Cow::Owned(format!("{}/{}", TRANSACTIONS_DIR, txn_file)), + file_type: FileType::TransactionFile, + }; + Either::Left(iter.chain(std::iter::once(txn_row))) + } else { + Either::Right(iter) + }; + + for fragment in manifest.fragments.iter() { + files += fragment.files.len(); + + if fragment.deletion_file.is_some() { + files += 1; + } + } + + let data_files = manifest.fragments.iter().flat_map(move |fragment| { + fragment.files.iter().map(move |data_file| { + let effective_base_uri = resolve_base_uri(manifest, data_file.base_id, base_uri); + FileRow { + version: manifest.version, + base_uri: Cow::Borrowed(effective_base_uri), + path: Cow::Owned(format!("{}/{}", DATA_DIR, data_file.path)), + file_type: FileType::DataFile, + } + }) + }); + + let deletion_files = manifest.fragments.iter().filter_map(|fragment| { + fragment.deletion_file.as_ref().map(|del_file| FileRow { + version: manifest.version, + base_uri: Cow::Borrowed(resolve_base_uri(manifest, del_file.base_id, base_uri)), + path: 
Cow::Owned(relative_deletion_file_path(fragment.id, del_file)), + file_type: FileType::DeletionFile, + }) + }); + + Box::new( + iter.chain(data_files) + .chain(deletion_files) + .exact_size(files), + ) +} + +fn manifest_file_batches<'a>( + manifest: &'a lance_table::format::Manifest, + base_uri: &'a str, + manifest_path: &'a str, +) -> Box> + Send + 'a> { + let mut builder = TrackedFileBatch::with_capacity(BATCH_SIZE); + + let mut iter = manifest_file_rows(manifest, base_uri, manifest_path); + let size = iter.len().div_ceil(BATCH_SIZE); + + let mut flushed = false; + Box::new( + std::iter::from_fn(move || { + if flushed { + return None; + } + while let Some(row) = iter.next() { + builder.append(&row); + if builder.len() == BATCH_SIZE { + let next_size = iter.len().div_ceil(BATCH_SIZE); + let old_builder = + std::mem::replace(&mut builder, TrackedFileBatch::with_capacity(next_size)); + return Some(old_builder.finish()); + } + } + // Flush the remaining partial batch. + flushed = true; + if builder.len() != 0 { + let partial = std::mem::replace(&mut builder, TrackedFileBatch::with_capacity(0)); + Some(partial.finish()) + } else { + None + } + }) + .exact_size(size), + ) +} + +async fn get_index_files( + uuids: impl IntoIterator, + base: &Path, + object_store: &lance_io::object_store::ObjectStore, + cache: &mut HashMap>, +) -> Result> { + let uuids: Vec = uuids.into_iter().collect(); + + // Phase 1: list uncached UUID directories concurrently. + let uncached: Vec = uuids + .iter() + .filter(|uuid| !cache.contains_key(*uuid)) + .copied() + .collect(); + if !uncached.is_empty() { + let parallelism = object_store.io_parallelism(); + // Clone for use in async move closures (ObjectStore is Arc-backed). 
+ let base_owned = base.clone(); + let os = object_store.clone(); + let new_entries: Vec<(Uuid, Vec)> = + futures::stream::iter(uncached) + .map(|uuid| { + let base = base_owned.clone(); + let os = os.clone(); + async move { + let prefix = base.join(INDICES_DIR).join(uuid.to_string()); + let files: Vec = + os.list(Some(prefix)).try_collect().await?; + lance_core::Result::Ok((uuid, files)) + } + }) + .buffer_unordered(parallelism) + .try_collect() + .await?; + + // Phase 2: insert results into cache (serial, no contention). + cache.extend(new_entries); + } + + // Phase 3: collect paths for the requested UUIDs in order. + let mut paths = Vec::new(); + for uuid in &uuids { + paths.extend( + cache[uuid] + .iter() + .map(|meta| remove_prefix(&meta.location, base)), + ); + } + Ok(paths) +} + +async fn index_file_batch(version: u64, base_uri: &str, paths: &[Path]) -> Result { + let mut builder = TrackedFileBatch::with_capacity(paths.len()); + for path in paths { + builder.append(&FileRow { + version, + base_uri: Cow::Borrowed(base_uri), + path: Cow::Owned(path.to_string()), + file_type: FileType::IndexFile, + }); + } + builder.finish() +} + +/// Progress update for [`Dataset::tracked_files_with_options`]. +#[derive(Debug, Clone)] +pub struct TrackedFilesProgress { + /// Number of manifests processed so far. + pub manifests_processed: usize, + /// Total number of manifests, if known. This becomes `Some` once the + /// listing stream is exhausted; until then it is `None`. + pub manifests_total: Option, +} + +/// Options for [`Dataset::tracked_files_with_options`]. +#[derive(Default)] +pub struct TrackedFilesOptions { + /// If set, only include manifests with `version >= min_version`. + pub min_version: Option, + /// If set, called each time a manifest has been fully processed. The + /// callback runs on a background tokio task, so it must not block (it + /// will stall the manifest reader pipeline). 
Order is the order in which + /// manifests finish processing, which is not the version order. + pub progress: Option>, +} + +// A `ManifestLocation` is ~100 bytes, so a 50k-slot mpsc channel costs ~5 MB +// in the worst case. That's enough headroom for the lister to run well ahead +// of the reader on datasets with hundreds of thousands of manifests, while +// still bounding memory. +const MAX_BUFFERED_LOCATIONS: usize = 50_000; + +impl Dataset { + /// Returns one row per (version, file) for every file referenced in any manifest. + /// + /// Each row contains the manifest version, the storage root URI, the file path + /// relative to that URI, and the file type. + /// + /// # Schema + /// + /// | Column | Type | Notes | + /// |------------|-----------------------------------|-------| + /// | `version` | `Int64` (non-null) | Manifest version number | + /// | `base_uri` | `Dictionary(Int32, Utf8)` (non-null) | Storage root for this file | + /// | `path` | `Utf8` (non-null) | Relative to `base_uri` | + /// | `type` | `Dictionary(Int8, Utf8)` (non-null) | One of: `data file`, `manifest`, `deletion file`, `transaction file`, `index file` | + /// + /// Output order is non-deterministic. + pub async fn tracked_files(&self) -> SendableRecordBatchStream { + self.tracked_files_with_options(TrackedFilesOptions::default()) + .await + } + + /// Like [`Self::tracked_files`], but with additional options for filtering + /// and progress reporting. 
+ pub async fn tracked_files_with_options( + &self, + options: TrackedFilesOptions, + ) -> SendableRecordBatchStream { + use lance_table::io::commit::ManifestLocation; + + let base = self.base.clone(); + let uri = self.uri().to_string(); + let object_store = self.object_store.clone(); + let commit_handler = self.commit_handler.clone(); + + // Pipeline architecture: + // + // Lister ──► tx_locations ──► Reader ──┬──► tx_manifest ──► Emitter ──► tx (output) + // └──► tx_indexes ──► IndexLister ──► tx (output) + + // Output channel: Emitter and IndexLister both send batches here. + let (tx, rx) = tokio::sync::mpsc::channel::>(4); + // Location channel: Lister -> Reader. Large buffer since locations are + // small (~100 bytes each) and we want the lister to run ahead. + let (tx_locations, mut rx_locations) = + tokio::sync::mpsc::channel::(MAX_BUFFERED_LOCATIONS); + // Manifest channel: Reader -> Emitter (small buffer for backpressure + // since manifests can be large). + let (tx_manifest, mut rx_manifest) = + tokio::sync::mpsc::channel::<(Arc, String, usize)>(2); + // Index channel: Reader -> IndexLister. + let (tx_indexes, mut rx_indexes) = + tokio::sync::mpsc::channel::<(u64, Vec)>(8); + + // Tracks estimated in-memory size of in-flight manifests. Reader adds + // before sending; Emitter subtracts after processing. + let inflight_mem = Arc::new(AtomicUsize::new(0)); + let mem_notify = Arc::new(tokio::sync::Notify::new()); + + // Progress: total is set by Lister once listing finishes, read by Emitter. + let total_manifests: Arc> = Arc::new(std::sync::OnceLock::new()); + + // --- Lister task --- + // Lists manifest locations, applies min_version filter, and counts the + // total. Locations are lightweight so we buffer up to MAX_BUFFERED_LOCATIONS. 
+ let tx_err_lister = tx.clone(); + let os_lister = object_store.clone(); + let base_lister = base.clone(); + let total_manifests_lister = total_manifests.clone(); + let min_version = options.min_version; + tokio::spawn(async move { + let result: lance_core::Result<()> = async { + let mut locations = + commit_handler.list_manifest_locations(&base_lister, &os_lister, false); + let mut count = 0usize; + while let Some(loc) = locations.next().await { + let loc = loc?; + if let Some(min_v) = min_version + && loc.version < min_v + { + continue; + } + count += 1; + if tx_locations.send(loc).await.is_err() { + return Ok(()); + } + } + let _ = total_manifests_lister.set(count); + Ok(()) + } + .await; + if let Err(e) = result { + let _ = tx_err_lister + .send(Err(datafusion::error::DataFusionError::from(e))) + .await; + } + }); + + // --- Reader task --- + // Reads manifests with memory-aware parallelism and fans out to + // Emitter (file batches) and IndexLister (index metadata). + let tx_err_reader = tx.clone(); + let os_reader = object_store.clone(); + let base_reader = base.clone(); + let inflight_mem_reader = inflight_mem.clone(); + let mem_notify_reader = mem_notify.clone(); + tokio::spawn(async move { + let result: lance_core::Result<()> = async { + let max_parallelism = os_reader.io_parallelism(); + + type ManifestResult = lance_core::Result<( + Arc, + String, + Vec, + usize, + )>; + let mut in_flight: FuturesUnordered< + std::pin::Pin + Send>>, + > = FuturesUnordered::new(); + let mut locations_exhausted = false; + + loop { + let can_launch = !locations_exhausted + && in_flight.len() < max_parallelism + && (in_flight.is_empty() + || inflight_mem_reader.load(Ordering::Acquire) + < MANIFEST_MEMORY_BUDGET); + + if in_flight.is_empty() && !can_launch { + break; + } + + tokio::select! { + biased; + // Always drain completed reads first. 
+ Some(item) = in_flight.next(), if !in_flight.is_empty() => { + let (manifest, manifest_path, indexes, estimated) = item?; + let version = manifest.version; + if tx_manifest + .send((manifest, manifest_path, estimated)) + .await + .is_err() + { + return Ok(()); + } + if !indexes.is_empty() + && tx_indexes.send((version, indexes)).await.is_err() + { + return Ok(()); + } + } + // Receive next location and start a read. + loc = rx_locations.recv(), if can_launch => { + match loc { + Some(loc) => { + let estimated = + loc.size.unwrap_or(0) as usize + * MANIFEST_DECOMPRESSION_RATIO; + inflight_mem_reader.fetch_add(estimated, Ordering::AcqRel); + + let os = os_reader.clone(); + let base = base_reader.clone(); + in_flight.push(Box::pin(async move { + let manifest = + read_manifest(&os, &loc.path, loc.size).await?; + let indexes = + read_manifest_indexes(&os, &loc, &manifest).await?; + let manifest_path = + remove_prefix(&loc.path, &base).to_string(); + lance_core::Result::Ok(( + Arc::new(manifest), + manifest_path, + indexes, + estimated, + )) + })); + } + None => { + locations_exhausted = true; + } + } + } + // Wake up when Emitter frees memory. + _ = mem_notify_reader.notified(), + if !can_launch && !in_flight.is_empty() => {} + } + } + Ok(()) + } + .await; + + if let Err(e) = result { + let _ = tx_err_reader + .send(Err(datafusion::error::DataFusionError::from(e))) + .await; + } + }); + + // --- Emitter task --- + // Converts manifests into file-row batches, releases memory budget, + // and reports progress. 
+ let tx_emitter = tx.clone(); + let uri_emitter = uri.clone(); + let progress_cb = options.progress; + tokio::spawn(async move { + let mut processed = 0usize; + while let Some((manifest, manifest_path, estimated)) = rx_manifest.recv().await { + let batches = manifest_file_batches(&manifest, &uri_emitter, &manifest_path); + for batch_result in batches { + let df_result = batch_result.map_err(datafusion::error::DataFusionError::from); + if tx_emitter.send(df_result).await.is_err() { + return; + } + } + drop(manifest); + inflight_mem.fetch_sub(estimated, Ordering::AcqRel); + mem_notify.notify_one(); + + processed += 1; + if let Some(ref cb) = progress_cb { + cb(TrackedFilesProgress { + manifests_processed: processed, + manifests_total: total_manifests.get().copied(), + }); + } + } + }); + + // --- IndexLister task --- + // Lists index directories and emits index file batches. + let tx_idx = tx; + let uri_idx = uri; + let os_idx = object_store; + let base_idx = base; + tokio::spawn(async move { + let mut uuid_cache: HashMap> = HashMap::new(); + while let Some((version, indexes)) = rx_indexes.recv().await { + let uuids: Vec = indexes.iter().map(|idx| idx.uuid).collect(); + match get_index_files(uuids, &base_idx, &os_idx, &mut uuid_cache).await { + Ok(index_paths) if !index_paths.is_empty() => { + match index_file_batch(version, &uri_idx, &index_paths).await { + Ok(batch) => { + if tx_idx.send(Ok(batch)).await.is_err() { + return; + } + } + Err(e) => { + let _ = tx_idx + .send(Err(datafusion::error::DataFusionError::from(e))) + .await; + return; + } + } + } + Err(e) => { + let _ = tx_idx + .send(Err(datafusion::error::DataFusionError::from(e))) + .await; + return; + } + _ => {} + } + } + }); + + let stream = tokio_stream::wrappers::ReceiverStream::new(rx); + + Box::pin(RecordBatchStreamAdapter::new( + TRACKED_FILES_SCHEMA.clone(), + stream, + )) + } + + /// Returns one row per file that physically exists at the dataset's base URI. 
+ /// + /// This scans the primary object store root only. Additional `base_paths` + /// entries in the manifest (for externally-located data files) are not + /// scanned by this method. + /// + /// # Schema + /// + /// | Column | Type | Notes | + /// |-----------------|--------------------------------------------|-------| + /// | `base_uri` | `Dictionary(Int32, Utf8)` (non-null) | Storage root | + /// | `path` | `Utf8` (non-null) | Relative to `base_uri` | + /// | `size_bytes` | `Int64` (non-null) | File size in bytes | + /// | `last_modified` | `Timestamp(Microsecond, "UTC")` (non-null) | Last modification time | + pub async fn all_files(&self) -> SendableRecordBatchStream { + let base = self.base.clone(); + let uri = self.uri().to_string(); + let object_store = self.object_store.clone(); + + let stream = object_store + .list(Some(base.clone())) + .try_chunks(4000) + .map_err(|err| err.1) + .and_then( + move |chunk| match build_all_files_batch(&chunk, &base, &uri) { + Ok(batch) => futures::future::ok(batch), + Err(e) => futures::future::err(e), + }, + ) + .map_err(datafusion::error::DataFusionError::from); + + Box::pin(RecordBatchStreamAdapter::new( + arrow::ALL_FILES_SCHEMA.clone(), + stream, + )) + } +} + +fn build_all_files_batch( + chunk: &[object_store::ObjectMeta], + base: &Path, + uri: &str, +) -> Result { + let n = chunk.len(); + let mut base_uri_builder = StringDictionaryBuilder::::with_capacity(n, 1, uri.len()); + let path_capacity = chunk.iter().map(|m| m.location.as_ref().len()).sum(); + let mut path_builder = StringBuilder::with_capacity(n, path_capacity); + let mut size_builder = Int64Builder::with_capacity(n); + let mut ts_builder = TimestampMicrosecondBuilder::with_capacity(n).with_timezone("UTC"); + + for meta in chunk { + let rel = remove_prefix(&meta.location, base); + base_uri_builder.append_value(uri); + path_builder.append_value(rel.as_ref()); + size_builder.append_value(meta.size as i64); + 
ts_builder.append_value(meta.last_modified.timestamp_micros()); + } + + RecordBatch::try_new( + arrow::ALL_FILES_SCHEMA.clone(), + vec![ + Arc::new(base_uri_builder.finish()), + Arc::new(path_builder.finish()), + Arc::new(size_builder.finish()), + Arc::new(ts_builder.finish()), + ], + ) + .map_err(Into::into) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::Dataset; + use crate::index::DatasetIndexExt; + use crate::index::vector::VectorIndexParams; + use arrow_array::{Array, Int32Array, RecordBatchIterator, StringArray}; + use arrow_schema::{DataType, Field, Schema as ArrowSchema, TimeUnit}; + use futures::TryStreamExt; + use lance_index::IndexType; + use lance_linalg::distance::MetricType; + use lance_testing::datagen::some_batch; + use std::collections::HashSet; + + async fn collect_rows(stream: SendableRecordBatchStream) -> Vec { + stream.try_collect::>().await.unwrap() + } + + fn count_rows(batches: &[RecordBatch]) -> usize { + batches.iter().map(|b| b.num_rows()).sum() + } + + fn dict_value_at(col: &dyn arrow_array::Array, i: usize) -> String { + if let Some(dict) = col + .as_any() + .downcast_ref::>() + { + let values = dict + .values() + .as_any() + .downcast_ref::() + .unwrap(); + values.value(dict.keys().value(i) as usize).to_string() + } else if let Some(dict) = col + .as_any() + .downcast_ref::>() + { + let values = dict + .values() + .as_any() + .downcast_ref::() + .unwrap(); + values.value(dict.keys().value(i) as usize).to_string() + } else { + panic!("expected a dictionary array with Int8 or Int32 keys"); + } + } + + fn collect_column_values(batches: &[RecordBatch], col: &str) -> Vec { + batches + .iter() + .flat_map(|b| { + let col = b.column_by_name(col).unwrap(); + (0..col.len()).map(|i| dict_value_at(col.as_ref(), i)) + }) + .collect() + } + + fn make_simple_batch() -> impl arrow_array::RecordBatchReader { + let schema = Arc::new(ArrowSchema::new(vec![Field::new( + "id", + DataType::Int32, + false, + )])); + let batch = 
RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + ) + .unwrap(); + RecordBatchIterator::new(vec![Ok(batch)], schema) + } + + #[tokio::test] + async fn test_tracked_files_basic() { + let uri = "memory://test_tracked_files_basic"; + + // Create then append twice to get 3 manifest versions. + let mut ds = Dataset::write(make_simple_batch(), uri, None) + .await + .unwrap(); + ds.append(make_simple_batch(), None).await.unwrap(); + ds.append(make_simple_batch(), None).await.unwrap(); + + let stream = ds.tracked_files().await; + let schema = stream.schema(); + let batches = collect_rows(stream).await; + + // Schema is correct. + assert_eq!(schema.field(0).name(), "version"); + assert_eq!(schema.field(1).name(), "base_uri"); + assert_eq!(schema.field(2).name(), "path"); + assert_eq!(schema.field(3).name(), "type"); + + let n = count_rows(&batches); + // At minimum: 3 manifests + 3 data files = 6 rows + assert!(n >= 6, "expected at least 6 rows, got {n}"); + + let types: HashSet = collect_column_values(&batches, "type") + .into_iter() + .collect(); + assert!(types.contains("manifest"), "missing 'manifest' rows"); + assert!(types.contains("data file"), "missing 'data file' rows"); + } + + #[tokio::test] + async fn test_tracked_files_deletion() { + let uri = "memory://test_tracked_files_deletion"; + + let mut ds = Dataset::write(make_simple_batch(), uri, None) + .await + .unwrap(); + ds.delete("id = 2").await.unwrap(); + + let stream = ds.tracked_files().await; + let batches = collect_rows(stream).await; + + let types: HashSet = collect_column_values(&batches, "type") + .into_iter() + .collect(); + assert!( + types.contains("deletion file"), + "missing 'deletion file' rows after delete; got types: {:?}", + types + ); + } + + #[tokio::test] + async fn test_tracked_files_transaction() { + let uri = "memory://test_tracked_files_transaction"; + + // Normal writes record transaction files by default. 
+ let mut ds = Dataset::write(make_simple_batch(), uri, None) + .await + .unwrap(); + ds.append(make_simple_batch(), None).await.unwrap(); + + let stream = ds.tracked_files().await; + let batches = collect_rows(stream).await; + + let types: HashSet = collect_column_values(&batches, "type") + .into_iter() + .collect(); + assert!( + types.contains("transaction file"), + "expected 'transaction file' rows; got types: {:?}", + types + ); + } + + #[tokio::test] + async fn test_tracked_files_index() { + let uri = "memory://test_tracked_files_index"; + + let mut ds = Dataset::write(some_batch(), uri, None).await.unwrap(); + let params = VectorIndexParams::ivf_pq(2, 8, 2, MetricType::L2, 5); + ds.create_index(&["indexable"], IndexType::Vector, None, ¶ms, true) + .await + .unwrap(); + + let stream = ds.tracked_files().await; + let batches = collect_rows(stream).await; + + let types: HashSet = collect_column_values(&batches, "type") + .into_iter() + .collect(); + assert!( + types.contains("index file"), + "expected 'index file' rows after vector index creation; got types: {:?}", + types + ); + } + + fn collect_versions(batches: &[RecordBatch]) -> Vec { + batches + .iter() + .flat_map(|b| { + let col = b + .column_by_name("version") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + (0..col.len()).map(|i| col.value(i)).collect::>() + }) + .collect() + } + + #[tokio::test] + async fn test_tracked_files_min_version() { + let uri = "memory://test_tracked_files_min_version"; + + // Create 3 versions. + let mut ds = Dataset::write(make_simple_batch(), uri, None) + .await + .unwrap(); + ds.append(make_simple_batch(), None).await.unwrap(); + ds.append(make_simple_batch(), None).await.unwrap(); + + // Without filter: should have rows from versions 1, 2, 3. 
+ let stream = ds.tracked_files().await; + let all_batches = collect_rows(stream).await; + let all_versions: HashSet = collect_versions(&all_batches).into_iter().collect(); + assert!(all_versions.contains(&1)); + assert!(all_versions.contains(&2)); + assert!(all_versions.contains(&3)); + + // With min_version=3: should only have version 3. + let stream = ds + .tracked_files_with_options(TrackedFilesOptions { + min_version: Some(3), + ..Default::default() + }) + .await; + let filtered_batches = collect_rows(stream).await; + let filtered_versions: HashSet = + collect_versions(&filtered_batches).into_iter().collect(); + assert_eq!(filtered_versions, HashSet::from([3])); + + // With min_version=2: should have versions 2 and 3. + let stream = ds + .tracked_files_with_options(TrackedFilesOptions { + min_version: Some(2), + ..Default::default() + }) + .await; + let filtered_batches = collect_rows(stream).await; + let filtered_versions: HashSet = + collect_versions(&filtered_batches).into_iter().collect(); + assert_eq!(filtered_versions, HashSet::from([2, 3])); + } + + #[tokio::test] + async fn test_tracked_files_progress() { + let uri = "memory://test_tracked_files_progress"; + + let mut ds = Dataset::write(make_simple_batch(), uri, None) + .await + .unwrap(); + ds.append(make_simple_batch(), None).await.unwrap(); + ds.append(make_simple_batch(), None).await.unwrap(); + + let updates = Arc::new(std::sync::Mutex::new(Vec::new())); + let updates_clone = updates.clone(); + + let stream = ds + .tracked_files_with_options(TrackedFilesOptions { + progress: Some(Box::new(move |p| { + updates_clone.lock().unwrap().push(p); + })), + ..Default::default() + }) + .await; + // Consume the full stream to drive all tasks to completion. + let _batches = collect_rows(stream).await; + + let updates = updates.lock().unwrap(); + // Should have exactly 3 progress updates (one per manifest). 
+ assert_eq!(updates.len(), 3, "expected 3 progress updates"); + // Processed counts should be monotonically increasing. + for (i, u) in updates.iter().enumerate() { + assert_eq!(u.manifests_processed, i + 1); + } + // The last update should know the total. + let last = updates.last().unwrap(); + assert_eq!(last.manifests_total, Some(3)); + } + + fn make_multi_row_batch(rows: usize) -> impl arrow_array::RecordBatchReader { + let schema = Arc::new(ArrowSchema::new(vec![Field::new( + "id", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..rows as i32))], + ) + .unwrap(); + RecordBatchIterator::new(vec![Ok(batch)], schema) + } + + /// Multi-fragment scenario: write 6 rows split across 3 fragments, delete + /// one row to produce a deletion file, then assert that every path + /// `tracked_files` emits for the latest version actually exists in the + /// `all_files` listing of the dataset directory. + #[tokio::test] + async fn test_tracked_files_paths_match_disk() { + use crate::dataset::WriteParams; + + let uri = "memory://test_tracked_files_paths_match_disk"; + + let write_params = WriteParams { + max_rows_per_file: 2, + ..Default::default() + }; + let mut ds = Dataset::write(make_multi_row_batch(6), uri, Some(write_params)) + .await + .unwrap(); + // Triggers a deletion file on one of the fragments. + ds.delete("id = 1").await.unwrap(); + let latest_version = ds.version().version as i64; + + // Sanity-check the multi-fragment setup: 3 data files in the latest manifest. 
+ assert_eq!( + ds.get_fragments().len(), + 3, + "expected 3 fragments from max_rows_per_file=2 over 6 rows" + ); + + let tracked = collect_rows(ds.tracked_files().await).await; + let all = collect_rows(ds.all_files().await).await; + + let all_paths: HashSet = all + .iter() + .flat_map(|b| { + let col = b + .column_by_name("path") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + (0..col.len()).map(|i| col.value(i).to_string()) + }) + .collect(); + + // Collect tracked paths grouped by type for the latest version only. + let mut tracked_at_latest: HashMap> = HashMap::new(); + for batch in &tracked { + let versions = batch + .column_by_name("version") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let paths = batch + .column_by_name("path") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let types = batch.column_by_name("type").unwrap(); + for i in 0..batch.num_rows() { + if versions.value(i) == latest_version { + tracked_at_latest + .entry(dict_value_at(types.as_ref(), i)) + .or_default() + .push(paths.value(i).to_string()); + } + } + } + + // Every file tracked at the latest version must exist on disk. + for (file_type, paths) in &tracked_at_latest { + for p in paths { + assert!( + all_paths.contains(p), + "tracked {file_type} path {p:?} not present in all_files (got {all_paths:?})" + ); + } + } + + // The latest manifest references one manifest, 3 data files, and 1 deletion file. + assert_eq!( + tracked_at_latest.get("manifest").map(Vec::len), + Some(1), + "expected 1 manifest row at latest version" + ); + assert_eq!( + tracked_at_latest.get("data file").map(Vec::len), + Some(3), + "expected 3 data files at latest version" + ); + assert_eq!( + tracked_at_latest.get("deletion file").map(Vec::len), + Some(1), + "expected 1 deletion file at latest version" + ); + + // Path shapes are as documented (relative to base_uri, no leading slash). 
+ for p in tracked_at_latest.get("data file").unwrap() { + assert!( + p.starts_with("data/"), + "data file path {p:?} should start with data/" + ); + } + let manifest_path = &tracked_at_latest.get("manifest").unwrap()[0]; + assert!( + manifest_path.starts_with("_versions/") && manifest_path.ends_with(".manifest"), + "manifest path {manifest_path:?} should match _versions/.manifest" + ); + let deletion_path = &tracked_at_latest.get("deletion file").unwrap()[0]; + assert!( + deletion_path.starts_with("_deletions/"), + "deletion path {deletion_path:?} should start with _deletions/" + ); + } + + /// Each `DataFile` inside a fragment carries its own `base_id`; the + /// emitted `base_uri` must be looked up per file, not per fragment. + #[test] + fn test_manifest_file_rows_per_file_base_id() { + use lance_core::datatypes::{Field as LanceField, Schema as LanceSchema}; + use lance_io::utils::CachedFileSize; + use lance_table::format::{ + BasePath, DataFile, DataStorageFormat, DeletionFile, DeletionFileType, Fragment, + Manifest, + }; + + let schema = LanceSchema { + fields: vec![LanceField::try_from(&Field::new("id", DataType::Int32, false)).unwrap()], + metadata: Default::default(), + }; + + let mk_file = |path: &str, base_id: Option| DataFile { + path: path.to_string(), + fields: Arc::from(vec![0]), + column_indices: Arc::from(Vec::::new()), + file_major_version: 2, + file_minor_version: 0, + file_size_bytes: CachedFileSize::unknown(), + base_id, + }; + + let fragment = Fragment { + id: 0, + files: vec![ + mk_file("a.lance", Some(1)), + mk_file("b.lance", Some(2)), + // No base_id -> falls back to the dataset base_uri. + mk_file("c.lance", None), + ], + // Deletion files also carry a base_id when they originate from a + // shallow clone, and must resolve against base_paths too. 
+ deletion_file: Some(DeletionFile { + read_version: 1, + id: 7, + file_type: DeletionFileType::Bitmap, + num_deleted_rows: Some(1), + base_id: Some(2), + }), + row_id_meta: None, + physical_rows: Some(3), + last_updated_at_version_meta: None, + created_at_version_meta: None, + }; + + let mut base_paths = HashMap::new(); + base_paths.insert( + 1, + BasePath::new(1, "s3://bucket-a/root".to_string(), None, false), + ); + base_paths.insert( + 2, + BasePath::new(2, "s3://bucket-b/root".to_string(), None, false), + ); + + let manifest = Manifest::new( + schema, + Arc::new(vec![fragment]), + DataStorageFormat::default(), + base_paths, + ); + + let rows: Vec<_> = + manifest_file_rows(&manifest, "memory://main", "_versions/1.manifest").collect(); + let by_path: HashMap<&str, &str> = rows + .iter() + .filter(|r| matches!(r.file_type, FileType::DataFile)) + .map(|r| (r.path.as_ref(), r.base_uri.as_ref())) + .collect(); + + assert_eq!(by_path.get("data/a.lance"), Some(&"s3://bucket-a/root")); + assert_eq!(by_path.get("data/b.lance"), Some(&"s3://bucket-b/root")); + assert_eq!(by_path.get("data/c.lance"), Some(&"memory://main")); + + let deletion = rows + .iter() + .find(|r| matches!(r.file_type, FileType::DeletionFile)) + .expect("deletion file row"); + assert_eq!(deletion.path.as_ref(), "_deletions/0-1-7.bin"); + assert_eq!(deletion.base_uri.as_ref(), "s3://bucket-b/root"); + } + + #[tokio::test] + async fn test_all_files_basic() { + let uri = "memory://test_all_files_basic"; + let ds = Dataset::write(make_simple_batch(), uri, None) + .await + .unwrap(); + + let stream = ds.all_files().await; + let schema = stream.schema(); + let batches = collect_rows(stream).await; + + assert_eq!(schema.field(0).name(), "base_uri"); + assert_eq!(schema.field(1).name(), "path"); + assert_eq!(schema.field(2).name(), "size_bytes"); + assert_eq!(schema.field(3).name(), "last_modified"); + + let n = count_rows(&batches); + // A dataset always has at least a manifest and a data file. 
+ assert!(n >= 2, "expected at least 2 physical files, got {n}"); + + // Verify sizes and timestamps are populated (non-zero). + for batch in &batches { + let sizes = batch + .column_by_name("size_bytes") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..sizes.len() { + assert!( + sizes.value(i) > 0, + "size_bytes should be positive, got {}", + sizes.value(i) + ); + } + + let ts = batch + .column_by_name("last_modified") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..ts.len() { + assert!( + ts.value(i) > 0, + "last_modified should be positive, got {}", + ts.value(i) + ); + } + } + } + + #[tokio::test] + async fn test_all_files_schema() { + let uri = "memory://test_all_files_schema"; + let ds = Dataset::write(make_simple_batch(), uri, None) + .await + .unwrap(); + + let stream = ds.all_files().await; + let schema = stream.schema(); + + assert_eq!(schema.fields().len(), 4); + assert_eq!(schema.field(0).name(), "base_uri"); + assert!(matches!( + schema.field(0).data_type(), + DataType::Dictionary(_, _) + )); + assert_eq!(schema.field(1).name(), "path"); + assert_eq!(schema.field(1).data_type(), &DataType::Utf8); + assert_eq!(schema.field(2).name(), "size_bytes"); + assert_eq!(schema.field(2).data_type(), &DataType::Int64); + assert_eq!(schema.field(3).name(), "last_modified"); + assert!(matches!( + schema.field(3).data_type(), + DataType::Timestamp(TimeUnit::Microsecond, _) + )); + } +} diff --git a/rust/lance/src/dataset/files/arrow.rs b/rust/lance/src/dataset/files/arrow.rs new file mode 100644 index 00000000000..22d767fe13b --- /dev/null +++ b/rust/lance/src/dataset/files/arrow.rs @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::sync::{Arc, LazyLock}; + +use arrow::array::{ArrayBuilder, Int8Builder}; +use arrow::datatypes::Int8Type; +use arrow_array::builder::{Int64Builder, StringBuilder, StringDictionaryBuilder}; +use 
arrow_array::types::Int32Type; +use arrow_array::{ArrayRef, DictionaryArray, RecordBatch}; +use arrow_schema::{DataType, Field, Schema, SchemaRef, TimeUnit}; +use lance_core::Result; + +use super::FileRow; +use super::file_types::FileType; + +pub static FILE_TYPE_DICT_ARRAY: LazyLock = LazyLock::new(|| { + let mut builder = StringBuilder::with_capacity(5, 20); + builder.append_value(FileType::Manifest.to_string()); + builder.append_value(FileType::DataFile.to_string()); + builder.append_value(FileType::DeletionFile.to_string()); + builder.append_value(FileType::TransactionFile.to_string()); + builder.append_value(FileType::IndexFile.to_string()); + Arc::new(builder.finish()) +}); + +pub struct FileTypeArrayBuilder { + builder: Int8Builder, +} + +impl FileTypeArrayBuilder { + pub fn with_capacity(capacity: usize) -> Self { + Self { + builder: Int8Builder::with_capacity(capacity), + } + } + + pub fn append_value(&mut self, file_type: FileType) { + let value = file_type.into(); + self.builder.append_value(value); + } + + pub fn finish(mut self) -> DictionaryArray { + let indices = self.builder.finish(); + DictionaryArray::new(indices, FILE_TYPE_DICT_ARRAY.clone()) + } +} + +pub(super) static TRACKED_FILES_SCHEMA: LazyLock = LazyLock::new(|| { + Arc::new(Schema::new(vec![ + Field::new("version", DataType::Int64, false), + Field::new( + "base_uri", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + false, + ), + Field::new("path", DataType::Utf8, false), + Field::new( + "type", + DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8)), + false, + ), + ])) +}); + +pub(super) static ALL_FILES_SCHEMA: LazyLock = LazyLock::new(|| { + Arc::new(Schema::new(vec![ + Field::new( + "base_uri", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + false, + ), + Field::new("path", DataType::Utf8, false), + Field::new("size_bytes", DataType::Int64, false), + Field::new( + "last_modified", + 
DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())), + false, + ), + ])) +}); + +/// Arrow batch builder for the `tracked_files` schema. +/// +/// Construct with [`with_capacity`](Self::with_capacity) to pre-size the +/// underlying buffers, then call [`extend`](Self::extend) to fill rows in bulk. +pub(super) struct TrackedFileBatch { + version: Int64Builder, + base_uri: StringDictionaryBuilder, + path: StringBuilder, + file_type: FileTypeArrayBuilder, +} + +impl TrackedFileBatch { + pub fn with_capacity(capacity: usize) -> Self { + Self { + version: Int64Builder::with_capacity(capacity), + // Most of the time, there is only 1 base_uri + base_uri: StringDictionaryBuilder::with_capacity(capacity, 1, 20), + path: StringBuilder::with_capacity(capacity, capacity * 50), + file_type: FileTypeArrayBuilder::with_capacity(capacity), + } + } + + pub fn append(&mut self, row: &FileRow) { + self.version.append_value(row.version as i64); + self.base_uri.append_value(&row.base_uri); + self.path.append_value(&row.path); + self.file_type.append_value(row.file_type); + } + + pub fn len(&self) -> usize { + self.version.len() + } + + pub fn finish(mut self) -> Result { + RecordBatch::try_new( + TRACKED_FILES_SCHEMA.clone(), + vec![ + Arc::new(self.version.finish()), + Arc::new(self.base_uri.finish()), + Arc::new(self.path.finish()), + Arc::new(self.file_type.finish()), + ], + ) + .map_err(Into::into) + } +} diff --git a/rust/lance/src/dataset/files/file_types.rs b/rust/lance/src/dataset/files/file_types.rs new file mode 100644 index 00000000000..7c20a81eae5 --- /dev/null +++ b/rust/lance/src/dataset/files/file_types.rs @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +// Discriminants are the dictionary keys used in the `tracked_files` output +// schema; they must stay in sync with `FILE_TYPE_DICT_ARRAY` in `arrow.rs`. 
+#[repr(i8)] +#[derive(Debug, Clone, Copy)] +pub enum FileType { + Manifest = 0, + DataFile = 1, + DeletionFile = 2, + TransactionFile = 3, + IndexFile = 4, +} + +impl std::fmt::Display for FileType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let s = match self { + Self::Manifest => "manifest", + Self::DataFile => "data file", + Self::DeletionFile => "deletion file", + Self::TransactionFile => "transaction file", + Self::IndexFile => "index file", + }; + write!(f, "{s}") + } +} + +impl From for i8 { + fn from(file_type: FileType) -> Self { + file_type as Self + } +} From d62489375b041015115b15bf0e3d5c6c63e8e246 Mon Sep 17 00:00:00 2001 From: Lance Release Bot Date: Fri, 5 Jun 2026 22:15:29 +0000 Subject: [PATCH 041/177] chore: release beta version 8.0.0-beta.6 --- .bumpversion.toml | 2 +- Cargo.lock | 46 +++++++++++++++++++-------------------- Cargo.toml | 42 +++++++++++++++++------------------ java/lance-jni/Cargo.lock | 38 ++++++++++++++++---------------- java/lance-jni/Cargo.toml | 2 +- java/pom.xml | 2 +- python/Cargo.lock | 38 ++++++++++++++++---------------- python/Cargo.toml | 2 +- 8 files changed, 86 insertions(+), 86 deletions(-) diff --git a/.bumpversion.toml b/.bumpversion.toml index 5b36554af95..f237926b6a6 100644 --- a/.bumpversion.toml +++ b/.bumpversion.toml @@ -1,5 +1,5 @@ [tool.bumpversion] -current_version = "8.0.0-beta.5" +current_version = "8.0.0-beta.6" parse = "(?P\\d+)\\.(?P\\d+)\\.(?P\\d+)(-(?P(beta|rc))\\.(?P\\d+))?" 
serialize = [ "{major}.{minor}.{patch}-{prerelease}.{prerelease_num}", diff --git a/Cargo.lock b/Cargo.lock index 03934fe744f..be1c25fa5e5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3166,7 +3166,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4479,7 +4479,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "all_asserts", "approx", @@ -4583,7 +4583,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow-array", "arrow-buffer", @@ -4631,7 +4631,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrayref", "paste", @@ -4640,7 +4640,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow-array", "arrow-buffer", @@ -4677,7 +4677,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow", "arrow-array", @@ -4710,7 +4710,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow", "arrow-array", @@ -4730,7 +4730,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow-arith", "arrow-array", @@ -4775,7 +4775,7 @@ dependencies = [ [[package]] name = "lance-examples" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "all_asserts", "arrow", @@ -4801,7 +4801,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow-arith", "arrow-array", @@ -4841,7 
+4841,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "datafusion", "geo-traits", @@ -4855,7 +4855,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "approx", "arc-swap", @@ -4934,7 +4934,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow", "arrow-arith", @@ -4983,7 +4983,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "approx", "arrow-array", @@ -5003,7 +5003,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow", "async-trait", @@ -5015,7 +5015,7 @@ dependencies = [ [[package]] name = "lance-namespace-datafusion" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow-array", "arrow-schema", @@ -5031,7 +5031,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow", "arrow-array", @@ -5090,7 +5090,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow-array", "arrow-buffer", @@ -5109,7 +5109,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow", "arrow-array", @@ -5156,7 +5156,7 @@ dependencies = [ [[package]] name = "lance-test-macros" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "proc-macro2", "quote", @@ -5165,7 +5165,7 @@ dependencies = [ [[package]] name = "lance-testing" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow-array", "arrow-schema", @@ -5178,7 +5178,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" 
dependencies = [ "icu_segmenter", "jieba-rs", @@ -5190,7 +5190,7 @@ dependencies = [ [[package]] name = "lance-tools" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "clap", "lance-core", diff --git a/Cargo.toml b/Cargo.toml index 5b94e2adfa3..89698cb4580 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,7 +31,7 @@ resolver = "3" [workspace.package] -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" @@ -56,26 +56,26 @@ rust-version = "1.91.0" [workspace.dependencies] arc-swap = "1.7" libc = "0.2.176" -lance = { version = "=8.0.0-beta.5", path = "./rust/lance", default-features = false } -lance-arrow = { version = "=8.0.0-beta.5", path = "./rust/lance-arrow" } -lance-core = { version = "=8.0.0-beta.5", path = "./rust/lance-core" } -lance-datafusion = { version = "=8.0.0-beta.5", path = "./rust/lance-datafusion" } -lance-datagen = { version = "=8.0.0-beta.5", path = "./rust/lance-datagen" } -lance-encoding = { version = "=8.0.0-beta.5", path = "./rust/lance-encoding" } -lance-file = { version = "=8.0.0-beta.5", path = "./rust/lance-file" } -lance-geo = { version = "=8.0.0-beta.5", path = "./rust/lance-geo" } -lance-index = { version = "=8.0.0-beta.5", path = "./rust/lance-index" } -lance-io = { version = "=8.0.0-beta.5", path = "./rust/lance-io", default-features = false } -lance-linalg = { version = "=8.0.0-beta.5", path = "./rust/lance-linalg" } -lance-namespace = { version = "=8.0.0-beta.5", path = "./rust/lance-namespace" } -lance-namespace-impls = { version = "=8.0.0-beta.5", path = "./rust/lance-namespace-impls" } +lance = { version = "=8.0.0-beta.6", path = "./rust/lance", default-features = false } +lance-arrow = { version = "=8.0.0-beta.6", path = "./rust/lance-arrow" } +lance-core = { version = "=8.0.0-beta.6", path = "./rust/lance-core" } +lance-datafusion = { version = "=8.0.0-beta.6", path = "./rust/lance-datafusion" } +lance-datagen = { version = "=8.0.0-beta.6", path 
= "./rust/lance-datagen" } +lance-encoding = { version = "=8.0.0-beta.6", path = "./rust/lance-encoding" } +lance-file = { version = "=8.0.0-beta.6", path = "./rust/lance-file" } +lance-geo = { version = "=8.0.0-beta.6", path = "./rust/lance-geo" } +lance-index = { version = "=8.0.0-beta.6", path = "./rust/lance-index" } +lance-io = { version = "=8.0.0-beta.6", path = "./rust/lance-io", default-features = false } +lance-linalg = { version = "=8.0.0-beta.6", path = "./rust/lance-linalg" } +lance-namespace = { version = "=8.0.0-beta.6", path = "./rust/lance-namespace" } +lance-namespace-impls = { version = "=8.0.0-beta.6", path = "./rust/lance-namespace-impls" } lance-namespace-datafusion = { version = "=7.0.0-beta.9", path = "./rust/lance-namespace-datafusion" } lance-namespace-reqwest-client = "0.8.2" -lance-select = { version = "=8.0.0-beta.5", path = "./rust/lance-select" } -lance-tokenizer = { version = "=8.0.0-beta.5", path = "./rust/lance-tokenizer" } -lance-table = { version = "=8.0.0-beta.5", path = "./rust/lance-table" } -lance-test-macros = { version = "=8.0.0-beta.5", path = "./rust/lance-test-macros" } -lance-testing = { version = "=8.0.0-beta.5", path = "./rust/lance-testing" } +lance-select = { version = "=8.0.0-beta.6", path = "./rust/lance-select" } +lance-tokenizer = { version = "=8.0.0-beta.6", path = "./rust/lance-tokenizer" } +lance-table = { version = "=8.0.0-beta.6", path = "./rust/lance-table" } +lance-test-macros = { version = "=8.0.0-beta.6", path = "./rust/lance-test-macros" } +lance-testing = { version = "=8.0.0-beta.6", path = "./rust/lance-testing" } approx = "0.5.1" # Note that this one does not include pyarrow arrow = { version = "58.0.0", optional = false, features = ["prettyprint"] } @@ -102,7 +102,7 @@ half = { "version" = "2.1", default-features = false, features = [ "num-traits", "std", ] } -lance-bitpacking = { version = "=8.0.0-beta.5", path = "./rust/compression/bitpacking" } +lance-bitpacking = { version = "=8.0.0-beta.6", 
path = "./rust/compression/bitpacking" } bitpacking = "0.9" bitvec = "1" bytes = "1.11.1" @@ -142,7 +142,7 @@ deepsize = "0.2.0" dirs = "6.0.0" either = "1.0" fst = { version = "0.4.7", features = ["levenshtein"] } -fsst = { version = "=8.0.0-beta.5", path = "./rust/compression/fsst" } +fsst = { version = "=8.0.0-beta.6", path = "./rust/compression/fsst" } futures = "0.3" geoarrow-array = "0.8" geoarrow-schema = "0.8" diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index 1e5a23e153e..fa08fd758aa 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -2569,7 +2569,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow-array", "rand 0.9.4", @@ -3770,7 +3770,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arc-swap", "arrow", @@ -3844,7 +3844,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow-array", "arrow-buffer", @@ -3886,7 +3886,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrayref", "paste", @@ -3895,7 +3895,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow-array", "arrow-buffer", @@ -3930,7 +3930,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow", "arrow-array", @@ -3962,7 +3962,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow", "arrow-array", @@ -3980,7 +3980,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.5" +version = 
"8.0.0-beta.6" dependencies = [ "arrow-arith", "arrow-array", @@ -4015,7 +4015,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow-arith", "arrow-array", @@ -4046,7 +4046,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "datafusion", "geo-traits", @@ -4060,7 +4060,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arc-swap", "arrow", @@ -4130,7 +4130,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow", "arrow-arith", @@ -4172,7 +4172,7 @@ dependencies = [ [[package]] name = "lance-jni" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow", "arrow-array", @@ -4208,7 +4208,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow-array", "arrow-buffer", @@ -4224,7 +4224,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow", "async-trait", @@ -4236,7 +4236,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow", "arrow-ipc", @@ -4280,7 +4280,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow-array", "arrow-buffer", @@ -4296,7 +4296,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow", "arrow-array", @@ -4334,7 +4334,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "icu_segmenter", "rust-stemmers", diff --git a/java/lance-jni/Cargo.toml b/java/lance-jni/Cargo.toml index 
723a213bac2..5eaa69f071b 100644 --- a/java/lance-jni/Cargo.toml +++ b/java/lance-jni/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lance-jni" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" edition = "2024" authors = ["Lance Devs "] rust-version = "1.91" diff --git a/java/pom.xml b/java/pom.xml index fbedf563249..c47523939a5 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -7,7 +7,7 @@ org.lance lance-core Lance Core - 8.0.0-beta.5 + 8.0.0-beta.6 jar Lance Format Java API diff --git a/python/Cargo.lock b/python/Cargo.lock index 86fed6f38c6..a00cce8038f 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -2919,7 +2919,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4136,7 +4136,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arc-swap", "arrow", @@ -4211,7 +4211,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow-array", "arrow-buffer", @@ -4253,7 +4253,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrayref", "paste", @@ -4262,7 +4262,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow-array", "arrow-buffer", @@ -4297,7 +4297,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow", "arrow-array", @@ -4329,7 +4329,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow", "arrow-array", @@ -4347,7 +4347,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = 
"8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow-arith", "arrow-array", @@ -4382,7 +4382,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow-arith", "arrow-array", @@ -4413,7 +4413,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "datafusion", "geo-traits", @@ -4427,7 +4427,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arc-swap", "arrow", @@ -4498,7 +4498,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow", "arrow-arith", @@ -4540,7 +4540,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow-array", "arrow-buffer", @@ -4556,7 +4556,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow", "async-trait", @@ -4568,7 +4568,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow", "arrow-ipc", @@ -4612,7 +4612,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow-array", "arrow-buffer", @@ -4628,7 +4628,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow", "arrow-array", @@ -4668,7 +4668,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "icu_segmenter", "jieba-rs", @@ -6156,7 +6156,7 @@ dependencies = [ [[package]] name = "pylance" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" dependencies = [ "arrow", "arrow-array", diff --git a/python/Cargo.toml b/python/Cargo.toml index 
654872ee863..9c7800d3c83 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pylance" -version = "8.0.0-beta.5" +version = "8.0.0-beta.6" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" From 13854cc0fd9df9872effaad49078308e26e01588 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 5 Jun 2026 15:41:41 -0700 Subject: [PATCH 042/177] refactor: clean up lance-index/lance-table boundary (#6988) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pure refactor in preparation for moving transaction logic into `lance-table`. No behavior change. I'm making this change so that I can put the transaction and conflict resolution logic into `lance-table` in the future. **Move A — bloom filter to `lance-core`.** The split-block bloom filter primitive (`Sbbf`, `SbbfBuilder`, `AsBytes`) moves from `lance-index::scalar::bloomfilter` to `lance-core::utils::bloomfilter`. It's a storage-agnostic data structure with no Lance semantics, so it belongs in a lower-level crate. `libm`/`twox-hash` move with it from `lance-index` to `lance-core`, and the in-crate consumer plus the `rust/lance` callers import from the new path. **Move B — system indices to `lance-table::system_index`.** `frag_reuse` and `mem_wal` are table-level structure persisted as indices: the table format interprets their contents (fragment remapping, row visibility) rather than treating them as opaque behind `IndexMetadata::index_details`. Their structs, pb conversions, and logic move to a new public `lance-table::system_index` module. The `impl Index` adapters stay in `lance-index` (the `Index` trait is local there) and re-export the relocated structs from `lance-table`, so all `lance_index::frag_reuse::*` / `lance_index::mem_wal::*` paths continue to work. The single `itertools::sorted_by_key` use is replaced with `sort_by_key` to avoid pulling `itertools` into `lance-table`. 
Closes #6987 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-authored-by: Claude Opus 4.8 (1M context) --- Cargo.lock | 4 +- Cargo.toml | 1 + python/Cargo.lock | 4 +- rust/lance-core/Cargo.toml | 2 + rust/lance-core/src/utils.rs | 1 + rust/lance-core/src/utils/bloomfilter.rs | 10 + .../src/utils}/bloomfilter/as_bytes.rs | 2 +- .../src/utils}/bloomfilter/sbbf.rs | 2 +- rust/lance-index/Cargo.toml | 2 - rust/lance-index/src/frag_reuse.rs | 491 +----------------- rust/lance-index/src/mem_wal.rs | 403 +------------- rust/lance-index/src/scalar/bloomfilter.rs | 5 +- rust/lance-table/src/lib.rs | 1 + rust/lance-table/src/system_index.rs | 15 + .../src/system_index/frag_reuse.rs | 480 +++++++++++++++++ rust/lance-table/src/system_index/mem_wal.rs | 400 ++++++++++++++ rust/lance/src/dataset/mem_wal/memtable.rs | 2 +- .../src/dataset/mem_wal/memtable/flush.rs | 2 +- .../mem_wal/scanner/exec/bloom_guard.rs | 2 +- .../dataset/mem_wal/scanner/point_lookup.rs | 2 +- .../write/merge_insert/inserted_rows.rs | 2 +- 21 files changed, 945 insertions(+), 888 deletions(-) create mode 100644 rust/lance-core/src/utils/bloomfilter.rs rename rust/{lance-index/src/scalar => lance-core/src/utils}/bloomfilter/as_bytes.rs (98%) rename rust/{lance-index/src/scalar => lance-core/src/utils}/bloomfilter/sbbf.rs (99%) create mode 100644 rust/lance-table/src/system_index.rs create mode 100644 rust/lance-table/src/system_index/frag_reuse.rs create mode 100644 rust/lance-table/src/system_index/mem_wal.rs diff --git a/Cargo.lock b/Cargo.lock index be1c25fa5e5..d9d7588827e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4655,6 +4655,7 @@ dependencies = [ "itertools 0.13.0", "lance-arrow", "libc", + "libm", "log", "moka", "num_cpus", @@ -4672,6 +4673,7 @@ dependencies = [ "tokio-stream", "tokio-util", "tracing", + "twox-hash", "url", ] @@ -4905,7 +4907,6 @@ dependencies = [ "lance-table", "lance-testing", "lance-tokenizer", - "libm", "libsais-rs", "log", "ndarray", @@ -4928,7 +4929,6 @@ 
dependencies = [ "test-log", "tokio", "tracing", - "twox-hash", "uuid", ] diff --git a/Cargo.toml b/Cargo.toml index 89698cb4580..6a267348167 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -202,6 +202,7 @@ tower = "0.5" tower-http = "0.5" tracing = "0.1" tracing-mock = { version = "=0.1.0-beta.3" } +twox-hash = "2.0" url = "2.5.7" uuid = { version = "1.2", features = ["v4", "serde"] } wiremock = "0.6" diff --git a/python/Cargo.lock b/python/Cargo.lock index a00cce8038f..7867ea71446 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -4277,6 +4277,7 @@ dependencies = [ "itertools 0.13.0", "lance-arrow", "libc", + "libm", "log", "moka", "num_cpus", @@ -4292,6 +4293,7 @@ dependencies = [ "tokio-stream", "tokio-util", "tracing", + "twox-hash", "url", ] @@ -4472,7 +4474,6 @@ dependencies = [ "lance-select", "lance-table", "lance-tokenizer", - "libm", "libsais-rs", "log", "ndarray", @@ -4492,7 +4493,6 @@ dependencies = [ "tempfile", "tokio", "tracing", - "twox-hash", "uuid", ] diff --git a/rust/lance-core/Cargo.toml b/rust/lance-core/Cargo.toml index 9dff4b001a4..9fb77e376bc 100644 --- a/rust/lance-core/Cargo.toml +++ b/rust/lance-core/Cargo.toml @@ -25,6 +25,7 @@ deepsize.workspace = true futures.workspace = true itertools.workspace = true libc.workspace = true +libm.workspace = true moka.workspace = true num_cpus = "1.0" object_store = { workspace = true } @@ -39,6 +40,7 @@ tokio.workspace = true tokio-stream.workspace = true tokio-util.workspace = true tracing.workspace = true +twox-hash.workspace = true url.workspace = true log.workspace = true diff --git a/rust/lance-core/src/utils.rs b/rust/lance-core/src/utils.rs index a7ac74a5b27..8f16744b158 100644 --- a/rust/lance-core/src/utils.rs +++ b/rust/lance-core/src/utils.rs @@ -7,6 +7,7 @@ pub mod assume; pub mod backoff; pub mod bit; pub mod blob; +pub mod bloomfilter; pub mod cpu; pub mod deletion; pub mod futures; diff --git a/rust/lance-core/src/utils/bloomfilter.rs b/rust/lance-core/src/utils/bloomfilter.rs new 
file mode 100644 index 00000000000..46cc272a694 --- /dev/null +++ b/rust/lance-core/src/utils/bloomfilter.rs @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Generic bloom filter primitives. +//! +//! These are storage-agnostic data structures with no Lance semantics, used by +//! higher-level crates (e.g. the bloom filter scalar index in `lance-index`). + +pub mod as_bytes; +pub mod sbbf; diff --git a/rust/lance-index/src/scalar/bloomfilter/as_bytes.rs b/rust/lance-core/src/utils/bloomfilter/as_bytes.rs similarity index 98% rename from rust/lance-index/src/scalar/bloomfilter/as_bytes.rs rename to rust/lance-core/src/utils/bloomfilter/as_bytes.rs index 22df8d6af7c..86b9632ce39 100644 --- a/rust/lance-index/src/scalar/bloomfilter/as_bytes.rs +++ b/rust/lance-core/src/utils/bloomfilter/as_bytes.rs @@ -7,7 +7,7 @@ //! similar to parquet::data_type::AsBytes but without the external dependency. /// Trait to convert primitive types to byte slices -/// Reference: https://arrow.apache.org/rust/src/parquet/data_type.rs.html +/// Reference: <https://arrow.apache.org/rust/src/parquet/data_type.rs.html> pub trait AsBytes { /// Convert the value to a byte slice fn as_bytes(&self) -> impl AsRef<[u8]>; diff --git a/rust/lance-index/src/scalar/bloomfilter/sbbf.rs b/rust/lance-core/src/utils/bloomfilter/sbbf.rs similarity index 99% rename from rust/lance-index/src/scalar/bloomfilter/sbbf.rs rename to rust/lance-core/src/utils/bloomfilter/sbbf.rs index cbb4eb76b12..06df2641008 100644 --- a/rust/lance-index/src/scalar/bloomfilter/sbbf.rs +++ b/rust/lance-core/src/utils/bloomfilter/sbbf.rs @@ -28,7 +28,7 @@ //! removed from Lance. //! 
-use crate::scalar::bloomfilter::as_bytes::AsBytes; +use super::as_bytes::AsBytes; use libm::lgamma; use std::error::Error; use std::fmt; diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml index 07d74760b75..b2041257161 100644 --- a/rust/lance-index/Cargo.toml +++ b/rust/lance-index/Cargo.toml @@ -50,7 +50,6 @@ lance-linalg.workspace = true lance-select.workspace = true lance-tokenizer.workspace = true lance-table.workspace = true -libm.workspace = true log.workspace = true ndarray.workspace = true num-traits.workspace = true @@ -70,7 +69,6 @@ crossbeam-queue.workspace = true bytes.workspace = true chrono.workspace = true uuid.workspace = true -twox-hash = "2.0" async-channel = "2.3.1" bitpacking = { version = "0.9.2", features = ["bitpacker4x"] } rand_distr.workspace = true diff --git a/rust/lance-index/src/frag_reuse.rs b/rust/lance-index/src/frag_reuse.rs index d145108d3c0..d09d8dc0684 100644 --- a/rust/lance-index/src/frag_reuse.rs +++ b/rust/lance-index/src/frag_reuse.rs @@ -1,357 +1,23 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use crate::{Index, IndexType}; -use arrow_array::cast::AsArray; -use arrow_array::types::UInt64Type; -use arrow_array::{Array, ArrayRef, PrimitiveArray, RecordBatch, UInt64Array}; -use async_trait::async_trait; -use deepsize::{Context, DeepSizeOf}; -use itertools::Itertools; -use lance_core::{Error, Result}; -use lance_select::RowAddrTreeMap; -use lance_table::format::pb::fragment_reuse_index_details::InlineContent; -use lance_table::format::{ExternalFile, Fragment, pb}; -use roaring::{RoaringBitmap, RoaringTreemap}; -use serde::{Deserialize, Serialize}; -use std::{any::Any, collections::HashMap, sync::Arc}; -use uuid::Uuid; - -pub const FRAG_REUSE_INDEX_NAME: &str = "__lance_frag_reuse"; -pub const FRAG_REUSE_DETAILS_FILE_NAME: &str = "details.binpb"; - -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] -pub struct FragDigest { - pub 
id: u64, - pub physical_rows: usize, - pub num_deleted_rows: usize, -} - -impl From<&FragDigest> for pb::fragment_reuse_index_details::FragmentDigest { - fn from(digest: &FragDigest) -> Self { - Self { - id: digest.id, - physical_rows: digest.physical_rows as u64, - num_deleted_rows: digest.num_deleted_rows as u64, - } - } -} - -impl From<&Fragment> for FragDigest { - fn from(fragment: &Fragment) -> Self { - Self { - id: fragment.id, - physical_rows: fragment - .physical_rows - .expect("Fragment doesn't have physical rows recorded"), - num_deleted_rows: fragment - .deletion_file - .as_ref() - .and_then(|d| d.num_deleted_rows) - .unwrap_or(0), - } - } -} - -impl TryFrom for FragDigest { - type Error = Error; - - fn try_from(digest: pb::fragment_reuse_index_details::FragmentDigest) -> Result { - Ok(Self { - id: digest.id, - physical_rows: digest.physical_rows as usize, - num_deleted_rows: digest.num_deleted_rows as usize, - }) - } -} - -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] -pub struct FragReuseGroup { - pub changed_row_addrs: Vec, - pub old_frags: Vec, - pub new_frags: Vec, -} - -impl From<&FragReuseGroup> for pb::fragment_reuse_index_details::Group { - fn from(group: &FragReuseGroup) -> Self { - Self { - changed_row_addrs: group.changed_row_addrs.clone(), - old_fragments: group.old_frags.iter().map(|f| f.into()).collect(), - new_fragments: group.new_frags.iter().map(|f| f.into()).collect(), - } - } -} - -impl TryFrom for FragReuseGroup { - type Error = Error; - - fn try_from(group: pb::fragment_reuse_index_details::Group) -> Result { - Ok(Self { - changed_row_addrs: group.changed_row_addrs, - old_frags: group - .old_fragments - .into_iter() - .map(FragDigest::try_from) - .collect::>()?, - new_frags: group - .new_fragments - .into_iter() - .map(FragDigest::try_from) - .collect::>()?, - }) - } -} - -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] -pub struct FragReuseVersion { - pub dataset_version: u64, 
- pub groups: Vec, -} - -impl From<&FragReuseVersion> for pb::fragment_reuse_index_details::Version { - fn from(version: &FragReuseVersion) -> Self { - Self { - dataset_version: version.dataset_version, - groups: version.groups.iter().map(|g| g.into()).collect(), - } - } -} - -impl TryFrom for FragReuseVersion { - type Error = Error; - - fn try_from(version: pb::fragment_reuse_index_details::Version) -> Result { - Ok(Self { - dataset_version: version.dataset_version, - groups: version - .groups - .into_iter() - .map(FragReuseGroup::try_from) - .collect::>()?, - }) - } -} - -impl FragReuseVersion { - pub fn old_frag_ids(&self) -> Vec { - self.groups - .iter() - .flat_map(|g| g.old_frags.iter().map(|f| f.id)) - .collect::>() - } - - pub fn new_frag_ids(&self) -> Vec { - self.groups - .iter() - .flat_map(|g| g.new_frags.iter().map(|f| f.id)) - .collect::>() - } - - pub fn new_frag_bitmap(&self) -> RoaringBitmap { - RoaringBitmap::from_iter(self.new_frag_ids().iter().map(|&id| id as u32)) - } -} - -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] -pub enum FragReuseIndexDetailsContentType { - Inline(FragReuseIndexDetails), - External(ExternalFile), -} - -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] -pub struct FragReuseIndexDetails { - pub versions: Vec, -} - -impl From<&FragReuseIndexDetails> for InlineContent { - fn from(details: &FragReuseIndexDetails) -> Self { - Self { - versions: details - .versions - .iter() - .map(|m| m.into()) - // sort from oldest to latest version - .sorted_by_key(|v: &pb::fragment_reuse_index_details::Version| v.dataset_version) - .collect(), - } - } -} +//! `Index`-trait adapter for the fragment-reuse system index. +//! +//! The data structures and table-format logic live in +//! [`lance_table::system_index::frag_reuse`]; this module re-exports them and +//! implements the local [`Index`] trait for [`FragReuseIndex`]. 
-impl TryFrom for FragReuseIndexDetails { - type Error = Error; +use std::any::Any; +use std::sync::Arc; - fn try_from(content: InlineContent) -> Result { - Ok(Self { - versions: content - .versions - .into_iter() - .map(|m| m.try_into()) - .collect::>>()?, - }) - } -} - -impl FragReuseIndexDetails { - pub fn new_frag_bitmap(&self) -> RoaringBitmap { - RoaringBitmap::from_iter( - self.versions - .iter() - .flat_map(|v| v.new_frag_ids().into_iter().map(|id| id as u32)), - ) - } -} - -/// An index that stores row ID maps. -/// A row ID map describes the mapping from old row address to new address after compactions. -/// Each version contains the mapping for one round of compaction. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct FragReuseIndex { - pub uuid: Uuid, - pub row_id_maps: Vec>>, - pub details: FragReuseIndexDetails, -} - -impl DeepSizeOf for FragReuseIndex { - fn deep_size_of_children(&self, cx: &mut Context) -> usize { - self.row_id_maps.deep_size_of_children(cx) + self.details.deep_size_of_children(cx) - } -} - -impl FragReuseIndex { - pub fn new( - uuid: Uuid, - row_id_maps: Vec>>, - details: FragReuseIndexDetails, - ) -> Self { - Self { - uuid, - row_id_maps, - details, - } - } - - pub fn remap_row_id(&self, row_id: u64) -> Option { - let mut mapped_value = Some(row_id); - for row_id_map in self.row_id_maps.iter() { - if mapped_value.is_some() { - mapped_value = row_id_map - .get(&mapped_value.unwrap()) - .copied() - .unwrap_or(mapped_value); - } - } - - mapped_value - } - - pub fn remap_row_addrs_tree_map(&self, row_addrs: &RowAddrTreeMap) -> RowAddrTreeMap { - RowAddrTreeMap::from_iter(row_addrs.row_addrs().unwrap().filter_map(|addr| { - let addr_as_u64 = u64::from(addr); - self.remap_row_id(addr_as_u64) - })) - } - - pub fn remap_row_ids_roaring_tree_map(&self, row_ids: &RoaringTreemap) -> RoaringTreemap { - RoaringTreemap::from_iter(row_ids.iter().filter_map(|addr| self.remap_row_id(addr))) - } - - /// Remap a record 
batch that contains a row_id column at index `row_id_idx` - /// Currently this assumes there are only 2 columns in the schema, - /// which is the case for all indexes. - /// For example, for btree, the schema is (value, row_id). - /// For vector index storage, the schema is (row_id, vector). - pub fn remap_row_ids_record_batch( - &self, - batch: RecordBatch, - row_id_idx: usize, - ) -> Result { - assert_eq!(batch.schema().fields().len(), 2); - let other_column_idx = 1 - row_id_idx; - let row_ids = batch.column(row_id_idx).as_primitive::(); - let (val_indices, new_row_ids): (Vec, Vec) = row_ids - .values() - .iter() - .enumerate() - .filter_map(|(idx, old_id)| { - self.remap_row_id(*old_id) - .map(|new_id| (idx as u64, new_id)) - }) - .unzip(); - let new_val_indices = UInt64Array::from_iter_values(val_indices); - let new_vals = - arrow_select::take::take(batch.column(other_column_idx), &new_val_indices, None)?; - - let mut batch_data: Vec<(usize, ArrayRef)> = vec![ - ( - row_id_idx, - Arc::new(UInt64Array::from_iter_values(new_row_ids)) as ArrayRef, - ), - (other_column_idx, Arc::new(new_vals)), - ]; - batch_data.sort_by_key(|(i, _)| *i); - Ok(RecordBatch::try_new( - batch.schema(), - batch_data.into_iter().map(|(_, item)| item).collect(), - )?) 
- } - - pub fn remap_row_ids_array(&self, array: ArrayRef) -> PrimitiveArray { - let primitive_array = array - .as_any() - .downcast_ref::>() - .expect("expected row IDs to be uint64 array"); - (0..primitive_array.len()) - .map(|i| { - if primitive_array.is_null(i) { - None - } else { - self.remap_row_id(primitive_array.value(i)) - } - }) - .collect() - } - - pub fn remap_fragment_bitmap(&self, fragment_bitmap: &mut RoaringBitmap) -> Result<()> { - for version in self.details.versions.iter() { - for group in version.groups.iter() { - let mut removed = 0; - for old_frag in group.old_frags.iter() { - if fragment_bitmap.remove(old_frag.id as u32) { - removed += 1; - } - } +use async_trait::async_trait; +use lance_core::{Error, Result}; +use roaring::RoaringBitmap; +use serde::Serialize; - if removed > 0 { - if removed != group.old_frags.len() { - // Straddle: the index covered only part of this rewrite - // group. Caused by the bug fixed in - // . - // We've already removed the indexed old_frags from the - // bitmap above; deliberately do NOT insert new_frags, - // since the merged fragment also contains rows that - // were never indexed. Affected rows fall through to - // flat scan until the next optimize_indices. The fix - // is persisted on the next write via build_manifest. - tracing::warn!( - "Healing straddling fragment-reuse rewrite group in index bitmap: \ - group {:?} was only partially indexed ({} of {} old fragments). 
\ - Affected rows will use flat scan until the next optimize_indices.", - group.old_frags, - removed, - group.old_frags.len(), - ); - continue; - } +pub use lance_table::system_index::frag_reuse::*; - for new_frag in group.new_frags.iter() { - fragment_bitmap.insert(new_frag.id as u32); - } - } - } - } - Ok(()) - } -} +use crate::{Index, IndexType}; #[derive(Serialize)] struct FragReuseStatistics { @@ -398,134 +64,3 @@ impl Index for FragReuseIndex { unimplemented!() } } - -#[cfg(test)] -mod tests { - - use super::*; - - #[tokio::test] - async fn test_serialize_deserialize_index_details() { - // Create sample FragReuseVersions with different dataset versions - let version1 = FragReuseVersion { - dataset_version: 2, - groups: vec![FragReuseGroup { - changed_row_addrs: vec![1, 2, 3], - old_frags: vec![FragDigest { - id: 1, - physical_rows: 1, - num_deleted_rows: 0, - }], - new_frags: vec![ - FragDigest { - id: 2, - physical_rows: 1, - num_deleted_rows: 0, - }, - FragDigest { - id: 3, - physical_rows: 1, - num_deleted_rows: 0, - }, - ], - }], - }; - - let version2 = FragReuseVersion { - dataset_version: 1, - groups: vec![FragReuseGroup { - changed_row_addrs: vec![4, 5, 6], - old_frags: vec![FragDigest { - id: 2, - physical_rows: 1, - num_deleted_rows: 0, - }], - new_frags: vec![ - FragDigest { - id: 4, - physical_rows: 1, - num_deleted_rows: 0, - }, - FragDigest { - id: 5, - physical_rows: 1, - num_deleted_rows: 0, - }, - ], - }], - }; - - // Create FragReuseIndexDetails with versions in reverse order - let details = FragReuseIndexDetails { - versions: vec![version1, version2], - }; - - // Convert to protobuf format - let inline_content: InlineContent = (&details).into(); - - // Convert back to FragReuseIndexDetails - let roundtrip_details = FragReuseIndexDetails::try_from(inline_content).unwrap(); - - // Verify the roundtrip - assert_eq!(roundtrip_details.versions.len(), 2); - - // Verify versions are sorted by dataset_version (oldest to latest) - 
assert_eq!(roundtrip_details.versions[0].dataset_version, 1); - assert_eq!( - roundtrip_details.versions[0].groups[0].changed_row_addrs, - vec![4, 5, 6] - ); - assert_eq!( - roundtrip_details.versions[0].groups[0].new_frags, - vec![ - FragDigest { - id: 4, - physical_rows: 1, - num_deleted_rows: 0, - }, - FragDigest { - id: 5, - physical_rows: 1, - num_deleted_rows: 0, - } - ] - ); - assert_eq!( - roundtrip_details.versions[0].groups[0].old_frags, - vec![FragDigest { - id: 2, - physical_rows: 1, - num_deleted_rows: 0, - }] - ); - - assert_eq!(roundtrip_details.versions[1].dataset_version, 2); - assert_eq!( - roundtrip_details.versions[1].groups[0].changed_row_addrs, - vec![1, 2, 3] - ); - assert_eq!( - roundtrip_details.versions[1].groups[0].new_frags, - vec![ - FragDigest { - id: 2, - physical_rows: 1, - num_deleted_rows: 0, - }, - FragDigest { - id: 3, - physical_rows: 1, - num_deleted_rows: 0, - } - ] - ); - assert_eq!( - roundtrip_details.versions[1].groups[0].old_frags, - vec![FragDigest { - id: 1, - physical_rows: 1, - num_deleted_rows: 0, - }] - ); - } -} diff --git a/rust/lance-index/src/mem_wal.rs b/rust/lance-index/src/mem_wal.rs index 4310db88908..f8f42093894 100644 --- a/rust/lance-index/src/mem_wal.rs +++ b/rust/lance-index/src/mem_wal.rs @@ -1,408 +1,23 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors +//! `Index`-trait adapter for the MemWAL system index. +//! +//! The data structures and table-format logic live in +//! [`lance_table::system_index::mem_wal`]; this module re-exports them and +//! implements the local [`Index`] trait for [`MemWalIndex`]. 
+ use std::any::Any; -use std::collections::HashMap; use std::sync::Arc; use async_trait::async_trait; -use deepsize::DeepSizeOf; use lance_core::Error; -use lance_table::format::pb; use roaring::RoaringBitmap; -use serde::{Deserialize, Serialize}; -use uuid::Uuid; - -use crate::{Index, IndexType}; - -pub const MEM_WAL_INDEX_NAME: &str = "__lance_mem_wal"; - -/// Type alias for shard identifier (UUID v4). -pub type ShardId = Uuid; - -/// A flushed MemTable generation and its storage location. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] -pub struct FlushedGeneration { - pub generation: u64, - pub path: String, -} - -impl From<&FlushedGeneration> for pb::FlushedGeneration { - fn from(fg: &FlushedGeneration) -> Self { - Self { - generation: fg.generation, - path: fg.path.clone(), - } - } -} - -impl From for FlushedGeneration { - fn from(fg: pb::FlushedGeneration) -> Self { - Self { - generation: fg.generation, - path: fg.path, - } - } -} - -/// A shard's merged generation, used in MemWalIndexDetails. 
-#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash, Serialize, Deserialize)] -pub struct MergedGeneration { - pub shard_id: Uuid, - pub generation: u64, -} - -impl DeepSizeOf for MergedGeneration { - fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { - 0 // UUID is 16 bytes fixed size, no heap allocations - } -} - -impl MergedGeneration { - pub fn new(shard_id: Uuid, generation: u64) -> Self { - Self { - shard_id, - generation, - } - } -} - -impl From<&MergedGeneration> for pb::MergedGeneration { - fn from(mg: &MergedGeneration) -> Self { - Self { - shard_id: Some((&mg.shard_id).into()), - generation: mg.generation, - } - } -} - -impl TryFrom for MergedGeneration { - type Error = Error; - - fn try_from(mg: pb::MergedGeneration) -> lance_core::Result { - let shard_id = mg - .shard_id - .as_ref() - .map(Uuid::try_from) - .ok_or_else(|| Error::invalid_input("Missing shard_id in MergedGeneration"))??; - Ok(Self { - shard_id, - generation: mg.generation, - }) - } -} - -/// Tracks which merged generation a base table index has been rebuilt to cover. -/// Used to determine whether to read from flushed MemTable indexes or base table. -#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] -pub struct IndexCatchupProgress { - pub index_name: String, - pub caught_up_generations: Vec, -} - -impl IndexCatchupProgress { - pub fn new(index_name: String, caught_up_generations: Vec) -> Self { - Self { - index_name, - caught_up_generations, - } - } - - /// Get the caught up generation for a specific shard. - /// Returns None if the shard is not present (assumed fully caught up). 
- pub fn caught_up_generation_for_shard(&self, shard_id: &Uuid) -> Option { - self.caught_up_generations - .iter() - .find(|mg| &mg.shard_id == shard_id) - .map(|mg| mg.generation) - } -} - -impl From<&IndexCatchupProgress> for pb::IndexCatchupProgress { - fn from(icp: &IndexCatchupProgress) -> Self { - Self { - index_name: icp.index_name.clone(), - caught_up_generations: icp - .caught_up_generations - .iter() - .map(|mg| mg.into()) - .collect(), - } - } -} - -impl TryFrom for IndexCatchupProgress { - type Error = Error; - - fn try_from(icp: pb::IndexCatchupProgress) -> lance_core::Result { - Ok(Self { - index_name: icp.index_name, - caught_up_generations: icp - .caught_up_generations - .into_iter() - .map(MergedGeneration::try_from) - .collect::>()?, - }) - } -} - -/// Shard manifest containing epoch-based fencing and WAL state. -/// Each shard has exactly one active writer at any time. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct ShardManifest { - pub shard_id: Uuid, - pub version: u64, - pub shard_spec_id: u32, - /// Computed shard field values as raw Arrow scalar bytes, keyed by field id. - /// The byte encoding follows Arrow's little-endian convention: int32 is 4 LE - /// bytes, utf8 is raw UTF-8 bytes, etc. The result_type in the corresponding - /// ShardingField from the ShardingSpec determines how to interpret each value. - pub shard_field_values: HashMap>, - pub writer_epoch: u64, - /// The most recent WAL entry position flushed to a MemTable. - /// Recovery replays from `replay_after_wal_entry_position + 1`. The - /// default value 0 means "no flush has ever stamped this shard" — WAL - /// positions themselves are 1-based, so 0 is never a valid covered - /// position. - pub replay_after_wal_entry_position: u64, - /// The most recent WAL entry position observed at manifest write time. - /// Default 0 means "no entry has been written yet"; WAL positions are - /// 1-based. 
- pub wal_entry_position_last_seen: u64, - pub current_generation: u64, - pub flushed_generations: Vec, -} - -impl DeepSizeOf for ShardManifest { - fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { - self.shard_field_values.deep_size_of_children(context) - + self.flushed_generations.deep_size_of_children(context) - } -} - -impl From<&ShardManifest> for pb::ShardManifest { - fn from(rm: &ShardManifest) -> Self { - Self { - shard_id: Some((&rm.shard_id).into()), - version: rm.version, - shard_spec_id: rm.shard_spec_id, - shard_field_entries: rm - .shard_field_values - .iter() - .map(|(k, v)| pb::ShardFieldEntry { - field_id: k.clone(), - value: v.clone(), - }) - .collect(), - writer_epoch: rm.writer_epoch, - replay_after_wal_entry_position: rm.replay_after_wal_entry_position, - wal_entry_position_last_seen: rm.wal_entry_position_last_seen, - current_generation: rm.current_generation, - flushed_generations: rm.flushed_generations.iter().map(|fg| fg.into()).collect(), - } - } -} - -impl TryFrom for ShardManifest { - type Error = Error; - - fn try_from(rm: pb::ShardManifest) -> lance_core::Result { - let shard_id = rm - .shard_id - .as_ref() - .map(Uuid::try_from) - .ok_or_else(|| Error::invalid_input("Missing shard_id in ShardManifest"))??; - let shard_field_values = rm - .shard_field_entries - .into_iter() - .map(|e| (e.field_id, e.value)) - .collect(); - Ok(Self { - shard_id, - version: rm.version, - shard_spec_id: rm.shard_spec_id, - shard_field_values, - writer_epoch: rm.writer_epoch, - replay_after_wal_entry_position: rm.replay_after_wal_entry_position, - wal_entry_position_last_seen: rm.wal_entry_position_last_seen, - current_generation: rm.current_generation, - flushed_generations: rm - .flushed_generations - .into_iter() - .map(FlushedGeneration::from) - .collect(), - }) - } -} - -/// Sharding field definition. 
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] -pub struct ShardingField { - pub field_id: String, - pub source_ids: Vec, - pub transform: Option, - pub expression: Option, - pub result_type: String, - pub parameters: HashMap, -} +use serde::Serialize; -impl From<&ShardingField> for pb::ShardingField { - fn from(rf: &ShardingField) -> Self { - Self { - field_id: rf.field_id.clone(), - source_ids: rf.source_ids.clone(), - transform: rf.transform.clone(), - expression: rf.expression.clone(), - result_type: rf.result_type.clone(), - parameters: rf.parameters.clone(), - } - } -} - -impl From for ShardingField { - fn from(rf: pb::ShardingField) -> Self { - Self { - field_id: rf.field_id, - source_ids: rf.source_ids, - transform: rf.transform, - expression: rf.expression, - result_type: rf.result_type, - parameters: rf.parameters, - } - } -} - -/// Sharding spec definition. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] -pub struct ShardingSpec { - pub spec_id: u32, - pub fields: Vec, -} - -impl From<&ShardingSpec> for pb::ShardingSpec { - fn from(rs: &ShardingSpec) -> Self { - Self { - spec_id: rs.spec_id, - fields: rs.fields.iter().map(|f| f.into()).collect(), - } - } -} +pub use lance_table::system_index::mem_wal::*; -impl From for ShardingSpec { - fn from(rs: pb::ShardingSpec) -> Self { - Self { - spec_id: rs.spec_id, - fields: rs.fields.into_iter().map(ShardingField::from).collect(), - } - } -} - -/// Index details for MemWAL Index, stored in IndexMetadata.index_details. -#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] -pub struct MemWalIndexDetails { - pub snapshot_ts_millis: i64, - pub num_shards: u32, - pub inline_snapshots: Option>, - pub sharding_specs: Vec, - pub maintained_indexes: Vec, - pub merged_generations: Vec, - pub index_catchup: Vec, - /// Default `ShardWriter` configuration values for this MemWAL index. 
- /// - /// Persisted so every writer — across processes and restarts — starts - /// from the same default writer configuration. These are defaults only; - /// an individual writer may still override any value at runtime in its - /// own (non-persisted) `ShardWriterConfig`. - pub writer_config_defaults: HashMap, -} - -impl From<&MemWalIndexDetails> for pb::MemWalIndexDetails { - fn from(details: &MemWalIndexDetails) -> Self { - Self { - snapshot_ts_millis: details.snapshot_ts_millis, - num_shards: details.num_shards, - inline_snapshots: details.inline_snapshots.clone(), - sharding_specs: details.sharding_specs.iter().map(|rs| rs.into()).collect(), - maintained_indexes: details.maintained_indexes.clone(), - merged_generations: details - .merged_generations - .iter() - .map(|mg| mg.into()) - .collect(), - index_catchup: details.index_catchup.iter().map(|icp| icp.into()).collect(), - writer_config_defaults: details.writer_config_defaults.clone(), - } - } -} - -impl TryFrom for MemWalIndexDetails { - type Error = Error; - - fn try_from(details: pb::MemWalIndexDetails) -> lance_core::Result { - Ok(Self { - snapshot_ts_millis: details.snapshot_ts_millis, - num_shards: details.num_shards, - inline_snapshots: details.inline_snapshots, - sharding_specs: details - .sharding_specs - .into_iter() - .map(ShardingSpec::from) - .collect(), - maintained_indexes: details.maintained_indexes, - merged_generations: details - .merged_generations - .into_iter() - .map(MergedGeneration::try_from) - .collect::>()?, - index_catchup: details - .index_catchup - .into_iter() - .map(IndexCatchupProgress::try_from) - .collect::>()?, - writer_config_defaults: details.writer_config_defaults, - }) - } -} - -/// MemWAL Index provides access to MemWAL configuration and state. 
-#[derive(Debug, Clone, PartialEq, Eq, DeepSizeOf)] -pub struct MemWalIndex { - pub details: MemWalIndexDetails, -} - -impl MemWalIndex { - pub fn new(details: MemWalIndexDetails) -> Self { - Self { details } - } - - pub fn merged_generation_for_shard(&self, shard_id: &Uuid) -> Option { - self.details - .merged_generations - .iter() - .find(|mg| &mg.shard_id == shard_id) - .map(|mg| mg.generation) - } - - /// Get the caught up generation for a specific index and shard. - /// Returns None if the index is not tracked (assumed fully caught up). - pub fn index_caught_up_generation(&self, index_name: &str, shard_id: &Uuid) -> Option { - self.details - .index_catchup - .iter() - .find(|icp| icp.index_name == index_name) - .and_then(|icp| icp.caught_up_generation_for_shard(shard_id)) - } - - /// Check if an index is fully caught up for a shard. - /// Returns true if the index covers all merged data for the shard. - pub fn is_index_caught_up(&self, index_name: &str, shard_id: &Uuid) -> bool { - let merged_gen = self.merged_generation_for_shard(shard_id).unwrap_or(0); - let caught_up_gen = self.index_caught_up_generation(index_name, shard_id); - - // If not tracked in index_catchup, assumed fully caught up - caught_up_gen.is_none_or(|generation| generation >= merged_gen) - } -} +use crate::{Index, IndexType}; #[derive(Serialize)] struct MemWalStatistics { diff --git a/rust/lance-index/src/scalar/bloomfilter.rs b/rust/lance-index/src/scalar/bloomfilter.rs index af37f982d1c..13057658e92 100644 --- a/rust/lance-index/src/scalar/bloomfilter.rs +++ b/rust/lance-index/src/scalar/bloomfilter.rs @@ -7,7 +7,6 @@ //! It is a space-efficient data structure that can be used to test whether an element is a member of a set. //! It's an inexact filter - they may include false positives that require rechecking. 
-use crate::scalar::bloomfilter::sbbf::{Sbbf, SbbfBuilder}; use crate::scalar::expression::{BloomFilterQueryParser, ScalarQueryParser}; use crate::scalar::registry::{ ScalarIndexPlugin, TrainingCriteria, TrainingOrdering, TrainingRequest, @@ -17,10 +16,10 @@ use crate::scalar::{ }; use crate::{Any, pb}; use arrow_array::{Array, UInt64Array}; -mod as_bytes; -pub mod sbbf; use arrow_schema::{DataType, Field}; use lance_arrow_stats::StatisticsAccumulator; +use lance_core::utils::bloomfilter::as_bytes; +use lance_core::utils::bloomfilter::sbbf::{Sbbf, SbbfBuilder}; use serde::{Deserialize, Serialize}; use std::sync::LazyLock; diff --git a/rust/lance-table/src/lib.rs b/rust/lance-table/src/lib.rs index ebe892ba534..89b424adc61 100644 --- a/rust/lance-table/src/lib.rs +++ b/rust/lance-table/src/lib.rs @@ -5,4 +5,5 @@ pub mod feature_flags; pub mod format; pub mod io; pub mod rowids; +pub mod system_index; pub mod utils; diff --git a/rust/lance-table/src/system_index.rs b/rust/lance-table/src/system_index.rs new file mode 100644 index 00000000000..021c01a5e52 --- /dev/null +++ b/rust/lance-table/src/system_index.rs @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! System indices: table-level structure persisted as indices. +//! +//! Unlike normal indices, whose internals stay opaque behind +//! [`crate::format::IndexMetadata::index_details`], the table format genuinely +//! interprets the contents of these indices (fragment remapping, row +//! visibility). They therefore live at the table layer. +//! +//! The `Index`-trait adapters for these structs live in `lance-index`, which +//! re-exports the structs defined here. 
+ +pub mod frag_reuse; +pub mod mem_wal; diff --git a/rust/lance-table/src/system_index/frag_reuse.rs b/rust/lance-table/src/system_index/frag_reuse.rs new file mode 100644 index 00000000000..141f35688d4 --- /dev/null +++ b/rust/lance-table/src/system_index/frag_reuse.rs @@ -0,0 +1,480 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::{collections::HashMap, sync::Arc}; + +use arrow_array::cast::AsArray; +use arrow_array::types::UInt64Type; +use arrow_array::{Array, ArrayRef, PrimitiveArray, RecordBatch, UInt64Array}; +use deepsize::{Context, DeepSizeOf}; +use lance_core::{Error, Result}; +use lance_select::RowAddrTreeMap; +use roaring::{RoaringBitmap, RoaringTreemap}; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +use crate::format::pb::fragment_reuse_index_details::InlineContent; +use crate::format::{ExternalFile, Fragment, pb}; + +pub const FRAG_REUSE_INDEX_NAME: &str = "__lance_frag_reuse"; +pub const FRAG_REUSE_DETAILS_FILE_NAME: &str = "details.binpb"; + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] +pub struct FragDigest { + pub id: u64, + pub physical_rows: usize, + pub num_deleted_rows: usize, +} + +impl From<&FragDigest> for pb::fragment_reuse_index_details::FragmentDigest { + fn from(digest: &FragDigest) -> Self { + Self { + id: digest.id, + physical_rows: digest.physical_rows as u64, + num_deleted_rows: digest.num_deleted_rows as u64, + } + } +} + +impl From<&Fragment> for FragDigest { + fn from(fragment: &Fragment) -> Self { + Self { + id: fragment.id, + physical_rows: fragment + .physical_rows + .expect("Fragment doesn't have physical rows recorded"), + num_deleted_rows: fragment + .deletion_file + .as_ref() + .and_then(|d| d.num_deleted_rows) + .unwrap_or(0), + } + } +} + +impl TryFrom for FragDigest { + type Error = Error; + + fn try_from(digest: pb::fragment_reuse_index_details::FragmentDigest) -> Result { + Ok(Self { + id: digest.id, + 
physical_rows: digest.physical_rows as usize, + num_deleted_rows: digest.num_deleted_rows as usize, + }) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] +pub struct FragReuseGroup { + pub changed_row_addrs: Vec, + pub old_frags: Vec, + pub new_frags: Vec, +} + +impl From<&FragReuseGroup> for pb::fragment_reuse_index_details::Group { + fn from(group: &FragReuseGroup) -> Self { + Self { + changed_row_addrs: group.changed_row_addrs.clone(), + old_fragments: group.old_frags.iter().map(|f| f.into()).collect(), + new_fragments: group.new_frags.iter().map(|f| f.into()).collect(), + } + } +} + +impl TryFrom for FragReuseGroup { + type Error = Error; + + fn try_from(group: pb::fragment_reuse_index_details::Group) -> Result { + Ok(Self { + changed_row_addrs: group.changed_row_addrs, + old_frags: group + .old_fragments + .into_iter() + .map(FragDigest::try_from) + .collect::>()?, + new_frags: group + .new_fragments + .into_iter() + .map(FragDigest::try_from) + .collect::>()?, + }) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] +pub struct FragReuseVersion { + pub dataset_version: u64, + pub groups: Vec, +} + +impl From<&FragReuseVersion> for pb::fragment_reuse_index_details::Version { + fn from(version: &FragReuseVersion) -> Self { + Self { + dataset_version: version.dataset_version, + groups: version.groups.iter().map(|g| g.into()).collect(), + } + } +} + +impl TryFrom for FragReuseVersion { + type Error = Error; + + fn try_from(version: pb::fragment_reuse_index_details::Version) -> Result { + Ok(Self { + dataset_version: version.dataset_version, + groups: version + .groups + .into_iter() + .map(FragReuseGroup::try_from) + .collect::>()?, + }) + } +} + +impl FragReuseVersion { + pub fn old_frag_ids(&self) -> Vec { + self.groups + .iter() + .flat_map(|g| g.old_frags.iter().map(|f| f.id)) + .collect::>() + } + + pub fn new_frag_ids(&self) -> Vec { + self.groups + .iter() + .flat_map(|g| 
g.new_frags.iter().map(|f| f.id)) + .collect::>() + } + + pub fn new_frag_bitmap(&self) -> RoaringBitmap { + RoaringBitmap::from_iter(self.new_frag_ids().iter().map(|&id| id as u32)) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] +pub enum FragReuseIndexDetailsContentType { + Inline(FragReuseIndexDetails), + External(ExternalFile), +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] +pub struct FragReuseIndexDetails { + pub versions: Vec, +} + +impl From<&FragReuseIndexDetails> for InlineContent { + fn from(details: &FragReuseIndexDetails) -> Self { + let mut versions: Vec = + details.versions.iter().map(|m| m.into()).collect(); + // sort from oldest to latest version + versions.sort_by_key(|v| v.dataset_version); + Self { versions } + } +} + +impl TryFrom for FragReuseIndexDetails { + type Error = Error; + + fn try_from(content: InlineContent) -> Result { + Ok(Self { + versions: content + .versions + .into_iter() + .map(|m| m.try_into()) + .collect::>>()?, + }) + } +} + +impl FragReuseIndexDetails { + pub fn new_frag_bitmap(&self) -> RoaringBitmap { + RoaringBitmap::from_iter( + self.versions + .iter() + .flat_map(|v| v.new_frag_ids().into_iter().map(|id| id as u32)), + ) + } +} + +/// An index that stores row ID maps. +/// A row ID map describes the mapping from old row address to new address after compactions. +/// Each version contains the mapping for one round of compaction. 
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct FragReuseIndex { + pub uuid: Uuid, + pub row_id_maps: Vec>>, + pub details: FragReuseIndexDetails, +} + +impl DeepSizeOf for FragReuseIndex { + fn deep_size_of_children(&self, cx: &mut Context) -> usize { + self.row_id_maps.deep_size_of_children(cx) + self.details.deep_size_of_children(cx) + } +} + +impl FragReuseIndex { + pub fn new( + uuid: Uuid, + row_id_maps: Vec>>, + details: FragReuseIndexDetails, + ) -> Self { + Self { + uuid, + row_id_maps, + details, + } + } + + pub fn remap_row_id(&self, row_id: u64) -> Option { + let mut mapped_value = Some(row_id); + for row_id_map in self.row_id_maps.iter() { + if mapped_value.is_some() { + mapped_value = row_id_map + .get(&mapped_value.unwrap()) + .copied() + .unwrap_or(mapped_value); + } + } + + mapped_value + } + + pub fn remap_row_addrs_tree_map(&self, row_addrs: &RowAddrTreeMap) -> RowAddrTreeMap { + RowAddrTreeMap::from_iter(row_addrs.row_addrs().unwrap().filter_map(|addr| { + let addr_as_u64 = u64::from(addr); + self.remap_row_id(addr_as_u64) + })) + } + + pub fn remap_row_ids_roaring_tree_map(&self, row_ids: &RoaringTreemap) -> RoaringTreemap { + RoaringTreemap::from_iter(row_ids.iter().filter_map(|addr| self.remap_row_id(addr))) + } + + /// Remap a record batch that contains a row_id column at index `row_id_idx` + /// Currently this assumes there are only 2 columns in the schema, + /// which is the case for all indexes. + /// For example, for btree, the schema is (value, row_id). + /// For vector index storage, the schema is (row_id, vector). 
+ pub fn remap_row_ids_record_batch( + &self, + batch: RecordBatch, + row_id_idx: usize, + ) -> Result { + assert_eq!(batch.schema().fields().len(), 2); + let other_column_idx = 1 - row_id_idx; + let row_ids = batch.column(row_id_idx).as_primitive::(); + let (val_indices, new_row_ids): (Vec, Vec) = row_ids + .values() + .iter() + .enumerate() + .filter_map(|(idx, old_id)| { + self.remap_row_id(*old_id) + .map(|new_id| (idx as u64, new_id)) + }) + .unzip(); + let new_val_indices = UInt64Array::from_iter_values(val_indices); + let new_vals = + arrow::compute::take(batch.column(other_column_idx), &new_val_indices, None)?; + + let mut batch_data: Vec<(usize, ArrayRef)> = vec![ + ( + row_id_idx, + Arc::new(UInt64Array::from_iter_values(new_row_ids)) as ArrayRef, + ), + (other_column_idx, Arc::new(new_vals)), + ]; + batch_data.sort_by_key(|(i, _)| *i); + Ok(RecordBatch::try_new( + batch.schema(), + batch_data.into_iter().map(|(_, item)| item).collect(), + )?) + } + + pub fn remap_row_ids_array(&self, array: ArrayRef) -> PrimitiveArray { + let primitive_array = array + .as_any() + .downcast_ref::>() + .expect("expected row IDs to be uint64 array"); + (0..primitive_array.len()) + .map(|i| { + if primitive_array.is_null(i) { + None + } else { + self.remap_row_id(primitive_array.value(i)) + } + }) + .collect() + } + + pub fn remap_fragment_bitmap(&self, fragment_bitmap: &mut RoaringBitmap) -> Result<()> { + for version in self.details.versions.iter() { + for group in version.groups.iter() { + let mut removed = 0; + for old_frag in group.old_frags.iter() { + if fragment_bitmap.remove(old_frag.id as u32) { + removed += 1; + } + } + + if removed > 0 { + if removed != group.old_frags.len() { + // Straddle: the index covered only part of this rewrite + // group. Caused by the bug fixed in + // . 
+ // We've already removed the indexed old_frags from the + // bitmap above; deliberately do NOT insert new_frags, + // since the merged fragment also contains rows that + // were never indexed. Affected rows fall through to + // flat scan until the next optimize_indices. The fix + // is persisted on the next write via build_manifest. + tracing::warn!( + "Healing straddling fragment-reuse rewrite group in index bitmap: \ + group {:?} was only partially indexed ({} of {} old fragments). \ + Affected rows will use flat scan until the next optimize_indices.", + group.old_frags, + removed, + group.old_frags.len(), + ); + continue; + } + + for new_frag in group.new_frags.iter() { + fragment_bitmap.insert(new_frag.id as u32); + } + } + } + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + + use super::*; + + #[tokio::test] + async fn test_serialize_deserialize_index_details() { + // Create sample FragReuseVersions with different dataset versions + let version1 = FragReuseVersion { + dataset_version: 2, + groups: vec![FragReuseGroup { + changed_row_addrs: vec![1, 2, 3], + old_frags: vec![FragDigest { + id: 1, + physical_rows: 1, + num_deleted_rows: 0, + }], + new_frags: vec![ + FragDigest { + id: 2, + physical_rows: 1, + num_deleted_rows: 0, + }, + FragDigest { + id: 3, + physical_rows: 1, + num_deleted_rows: 0, + }, + ], + }], + }; + + let version2 = FragReuseVersion { + dataset_version: 1, + groups: vec![FragReuseGroup { + changed_row_addrs: vec![4, 5, 6], + old_frags: vec![FragDigest { + id: 2, + physical_rows: 1, + num_deleted_rows: 0, + }], + new_frags: vec![ + FragDigest { + id: 4, + physical_rows: 1, + num_deleted_rows: 0, + }, + FragDigest { + id: 5, + physical_rows: 1, + num_deleted_rows: 0, + }, + ], + }], + }; + + // Create FragReuseIndexDetails with versions in reverse order + let details = FragReuseIndexDetails { + versions: vec![version1, version2], + }; + + // Convert to protobuf format + let inline_content: InlineContent = (&details).into(); + + // Convert 
back to FragReuseIndexDetails + let roundtrip_details = FragReuseIndexDetails::try_from(inline_content).unwrap(); + + // Verify the roundtrip + assert_eq!(roundtrip_details.versions.len(), 2); + + // Verify versions are sorted by dataset_version (oldest to latest) + assert_eq!(roundtrip_details.versions[0].dataset_version, 1); + assert_eq!( + roundtrip_details.versions[0].groups[0].changed_row_addrs, + vec![4, 5, 6] + ); + assert_eq!( + roundtrip_details.versions[0].groups[0].new_frags, + vec![ + FragDigest { + id: 4, + physical_rows: 1, + num_deleted_rows: 0, + }, + FragDigest { + id: 5, + physical_rows: 1, + num_deleted_rows: 0, + } + ] + ); + assert_eq!( + roundtrip_details.versions[0].groups[0].old_frags, + vec![FragDigest { + id: 2, + physical_rows: 1, + num_deleted_rows: 0, + }] + ); + + assert_eq!(roundtrip_details.versions[1].dataset_version, 2); + assert_eq!( + roundtrip_details.versions[1].groups[0].changed_row_addrs, + vec![1, 2, 3] + ); + assert_eq!( + roundtrip_details.versions[1].groups[0].new_frags, + vec![ + FragDigest { + id: 2, + physical_rows: 1, + num_deleted_rows: 0, + }, + FragDigest { + id: 3, + physical_rows: 1, + num_deleted_rows: 0, + } + ] + ); + assert_eq!( + roundtrip_details.versions[1].groups[0].old_frags, + vec![FragDigest { + id: 1, + physical_rows: 1, + num_deleted_rows: 0, + }] + ); + } +} diff --git a/rust/lance-table/src/system_index/mem_wal.rs b/rust/lance-table/src/system_index/mem_wal.rs new file mode 100644 index 00000000000..9b42cf90e6c --- /dev/null +++ b/rust/lance-table/src/system_index/mem_wal.rs @@ -0,0 +1,400 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::collections::HashMap; + +use deepsize::DeepSizeOf; +use lance_core::Error; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +use crate::format::pb; + +pub const MEM_WAL_INDEX_NAME: &str = "__lance_mem_wal"; + +/// Type alias for shard identifier (UUID v4). 
+pub type ShardId = Uuid; + +/// A flushed MemTable generation and its storage location. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] +pub struct FlushedGeneration { + pub generation: u64, + pub path: String, +} + +impl From<&FlushedGeneration> for pb::FlushedGeneration { + fn from(fg: &FlushedGeneration) -> Self { + Self { + generation: fg.generation, + path: fg.path.clone(), + } + } +} + +impl From for FlushedGeneration { + fn from(fg: pb::FlushedGeneration) -> Self { + Self { + generation: fg.generation, + path: fg.path, + } + } +} + +/// A shard's merged generation, used in MemWalIndexDetails. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash, Serialize, Deserialize)] +pub struct MergedGeneration { + pub shard_id: Uuid, + pub generation: u64, +} + +impl DeepSizeOf for MergedGeneration { + fn deep_size_of_children(&self, _context: &mut deepsize::Context) -> usize { + 0 // UUID is 16 bytes fixed size, no heap allocations + } +} + +impl MergedGeneration { + pub fn new(shard_id: Uuid, generation: u64) -> Self { + Self { + shard_id, + generation, + } + } +} + +impl From<&MergedGeneration> for pb::MergedGeneration { + fn from(mg: &MergedGeneration) -> Self { + Self { + shard_id: Some((&mg.shard_id).into()), + generation: mg.generation, + } + } +} + +impl TryFrom for MergedGeneration { + type Error = Error; + + fn try_from(mg: pb::MergedGeneration) -> lance_core::Result { + let shard_id = mg + .shard_id + .as_ref() + .map(Uuid::try_from) + .ok_or_else(|| Error::invalid_input("Missing shard_id in MergedGeneration"))??; + Ok(Self { + shard_id, + generation: mg.generation, + }) + } +} + +/// Tracks which merged generation a base table index has been rebuilt to cover. +/// Used to determine whether to read from flushed MemTable indexes or base table. 
+#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] +pub struct IndexCatchupProgress { + pub index_name: String, + pub caught_up_generations: Vec, +} + +impl IndexCatchupProgress { + pub fn new(index_name: String, caught_up_generations: Vec) -> Self { + Self { + index_name, + caught_up_generations, + } + } + + /// Get the caught up generation for a specific shard. + /// Returns None if the shard is not present (assumed fully caught up). + pub fn caught_up_generation_for_shard(&self, shard_id: &Uuid) -> Option { + self.caught_up_generations + .iter() + .find(|mg| &mg.shard_id == shard_id) + .map(|mg| mg.generation) + } +} + +impl From<&IndexCatchupProgress> for pb::IndexCatchupProgress { + fn from(icp: &IndexCatchupProgress) -> Self { + Self { + index_name: icp.index_name.clone(), + caught_up_generations: icp + .caught_up_generations + .iter() + .map(|mg| mg.into()) + .collect(), + } + } +} + +impl TryFrom for IndexCatchupProgress { + type Error = Error; + + fn try_from(icp: pb::IndexCatchupProgress) -> lance_core::Result { + Ok(Self { + index_name: icp.index_name, + caught_up_generations: icp + .caught_up_generations + .into_iter() + .map(MergedGeneration::try_from) + .collect::>()?, + }) + } +} + +/// Shard manifest containing epoch-based fencing and WAL state. +/// Each shard has exactly one active writer at any time. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct ShardManifest { + pub shard_id: Uuid, + pub version: u64, + pub shard_spec_id: u32, + /// Computed shard field values as raw Arrow scalar bytes, keyed by field id. + /// The byte encoding follows Arrow's little-endian convention: int32 is 4 LE + /// bytes, utf8 is raw UTF-8 bytes, etc. The result_type in the corresponding + /// ShardingField from the ShardingSpec determines how to interpret each value. + pub shard_field_values: HashMap>, + pub writer_epoch: u64, + /// The most recent WAL entry position flushed to a MemTable. 
+ /// Recovery replays from `replay_after_wal_entry_position + 1`. The + /// default value 0 means "no flush has ever stamped this shard" — WAL + /// positions themselves are 1-based, so 0 is never a valid covered + /// position. + pub replay_after_wal_entry_position: u64, + /// The most recent WAL entry position observed at manifest write time. + /// Default 0 means "no entry has been written yet"; WAL positions are + /// 1-based. + pub wal_entry_position_last_seen: u64, + pub current_generation: u64, + pub flushed_generations: Vec, +} + +impl DeepSizeOf for ShardManifest { + fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + self.shard_field_values.deep_size_of_children(context) + + self.flushed_generations.deep_size_of_children(context) + } +} + +impl From<&ShardManifest> for pb::ShardManifest { + fn from(rm: &ShardManifest) -> Self { + Self { + shard_id: Some((&rm.shard_id).into()), + version: rm.version, + shard_spec_id: rm.shard_spec_id, + shard_field_entries: rm + .shard_field_values + .iter() + .map(|(k, v)| pb::ShardFieldEntry { + field_id: k.clone(), + value: v.clone(), + }) + .collect(), + writer_epoch: rm.writer_epoch, + replay_after_wal_entry_position: rm.replay_after_wal_entry_position, + wal_entry_position_last_seen: rm.wal_entry_position_last_seen, + current_generation: rm.current_generation, + flushed_generations: rm.flushed_generations.iter().map(|fg| fg.into()).collect(), + } + } +} + +impl TryFrom for ShardManifest { + type Error = Error; + + fn try_from(rm: pb::ShardManifest) -> lance_core::Result { + let shard_id = rm + .shard_id + .as_ref() + .map(Uuid::try_from) + .ok_or_else(|| Error::invalid_input("Missing shard_id in ShardManifest"))??; + let shard_field_values = rm + .shard_field_entries + .into_iter() + .map(|e| (e.field_id, e.value)) + .collect(); + Ok(Self { + shard_id, + version: rm.version, + shard_spec_id: rm.shard_spec_id, + shard_field_values, + writer_epoch: rm.writer_epoch, + 
replay_after_wal_entry_position: rm.replay_after_wal_entry_position, + wal_entry_position_last_seen: rm.wal_entry_position_last_seen, + current_generation: rm.current_generation, + flushed_generations: rm + .flushed_generations + .into_iter() + .map(FlushedGeneration::from) + .collect(), + }) + } +} + +/// Sharding field definition. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] +pub struct ShardingField { + pub field_id: String, + pub source_ids: Vec, + pub transform: Option, + pub expression: Option, + pub result_type: String, + pub parameters: HashMap, +} + +impl From<&ShardingField> for pb::ShardingField { + fn from(rf: &ShardingField) -> Self { + Self { + field_id: rf.field_id.clone(), + source_ids: rf.source_ids.clone(), + transform: rf.transform.clone(), + expression: rf.expression.clone(), + result_type: rf.result_type.clone(), + parameters: rf.parameters.clone(), + } + } +} + +impl From for ShardingField { + fn from(rf: pb::ShardingField) -> Self { + Self { + field_id: rf.field_id, + source_ids: rf.source_ids, + transform: rf.transform, + expression: rf.expression, + result_type: rf.result_type, + parameters: rf.parameters, + } + } +} + +/// Sharding spec definition. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] +pub struct ShardingSpec { + pub spec_id: u32, + pub fields: Vec, +} + +impl From<&ShardingSpec> for pb::ShardingSpec { + fn from(rs: &ShardingSpec) -> Self { + Self { + spec_id: rs.spec_id, + fields: rs.fields.iter().map(|f| f.into()).collect(), + } + } +} + +impl From for ShardingSpec { + fn from(rs: pb::ShardingSpec) -> Self { + Self { + spec_id: rs.spec_id, + fields: rs.fields.into_iter().map(ShardingField::from).collect(), + } + } +} + +/// Index details for MemWAL Index, stored in IndexMetadata.index_details. 
+#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)] +pub struct MemWalIndexDetails { + pub snapshot_ts_millis: i64, + pub num_shards: u32, + pub inline_snapshots: Option>, + pub sharding_specs: Vec, + pub maintained_indexes: Vec, + pub merged_generations: Vec, + pub index_catchup: Vec, + /// Default `ShardWriter` configuration values for this MemWAL index. + /// + /// Persisted so every writer — across processes and restarts — starts + /// from the same default writer configuration. These are defaults only; + /// an individual writer may still override any value at runtime in its + /// own (non-persisted) `ShardWriterConfig`. + pub writer_config_defaults: HashMap, +} + +impl From<&MemWalIndexDetails> for pb::MemWalIndexDetails { + fn from(details: &MemWalIndexDetails) -> Self { + Self { + snapshot_ts_millis: details.snapshot_ts_millis, + num_shards: details.num_shards, + inline_snapshots: details.inline_snapshots.clone(), + sharding_specs: details.sharding_specs.iter().map(|rs| rs.into()).collect(), + maintained_indexes: details.maintained_indexes.clone(), + merged_generations: details + .merged_generations + .iter() + .map(|mg| mg.into()) + .collect(), + index_catchup: details.index_catchup.iter().map(|icp| icp.into()).collect(), + writer_config_defaults: details.writer_config_defaults.clone(), + } + } +} + +impl TryFrom for MemWalIndexDetails { + type Error = Error; + + fn try_from(details: pb::MemWalIndexDetails) -> lance_core::Result { + Ok(Self { + snapshot_ts_millis: details.snapshot_ts_millis, + num_shards: details.num_shards, + inline_snapshots: details.inline_snapshots, + sharding_specs: details + .sharding_specs + .into_iter() + .map(ShardingSpec::from) + .collect(), + maintained_indexes: details.maintained_indexes, + merged_generations: details + .merged_generations + .into_iter() + .map(MergedGeneration::try_from) + .collect::>()?, + index_catchup: details + .index_catchup + .into_iter() + 
.map(IndexCatchupProgress::try_from) + .collect::>()?, + writer_config_defaults: details.writer_config_defaults, + }) + } +} + +/// MemWAL Index provides access to MemWAL configuration and state. +#[derive(Debug, Clone, PartialEq, Eq, DeepSizeOf)] +pub struct MemWalIndex { + pub details: MemWalIndexDetails, +} + +impl MemWalIndex { + pub fn new(details: MemWalIndexDetails) -> Self { + Self { details } + } + + pub fn merged_generation_for_shard(&self, shard_id: &Uuid) -> Option { + self.details + .merged_generations + .iter() + .find(|mg| &mg.shard_id == shard_id) + .map(|mg| mg.generation) + } + + /// Get the caught up generation for a specific index and shard. + /// Returns None if the index is not tracked (assumed fully caught up). + pub fn index_caught_up_generation(&self, index_name: &str, shard_id: &Uuid) -> Option { + self.details + .index_catchup + .iter() + .find(|icp| icp.index_name == index_name) + .and_then(|icp| icp.caught_up_generation_for_shard(shard_id)) + } + + /// Check if an index is fully caught up for a shard. + /// Returns true if the index covers all merged data for the shard. 
+ pub fn is_index_caught_up(&self, index_name: &str, shard_id: &Uuid) -> bool { + let merged_gen = self.merged_generation_for_shard(shard_id).unwrap_or(0); + let caught_up_gen = self.index_caught_up_generation(index_name, shard_id); + + // If not tracked in index_catchup, assumed fully caught up + caught_up_gen.is_none_or(|generation| generation >= merged_gen) + } +} diff --git a/rust/lance/src/dataset/mem_wal/memtable.rs b/rust/lance/src/dataset/mem_wal/memtable.rs index fd23fd6d577..cb95a4ab531 100644 --- a/rust/lance/src/dataset/mem_wal/memtable.rs +++ b/rust/lance/src/dataset/mem_wal/memtable.rs @@ -14,8 +14,8 @@ use std::time::{Duration, Instant}; use arrow_array::{Array, RecordBatch, RecordBatchIterator}; use arrow_schema::Schema as ArrowSchema; use lance_core::datatypes::Schema; +use lance_core::utils::bloomfilter::sbbf::Sbbf; use lance_core::{Error, Result}; -use lance_index::scalar::bloomfilter::sbbf::Sbbf; use tokio::sync::RwLock; use tracing::instrument; use uuid::Uuid; diff --git a/rust/lance/src/dataset/mem_wal/memtable/flush.rs b/rust/lance/src/dataset/mem_wal/memtable/flush.rs index 8f0e34db5ec..c4794d4c8f3 100644 --- a/rust/lance/src/dataset/mem_wal/memtable/flush.rs +++ b/rust/lance/src/dataset/mem_wal/memtable/flush.rs @@ -332,7 +332,7 @@ impl MemTableFlusher { async fn write_bloom_filter( &self, path: &Path, - bloom: &lance_index::scalar::bloomfilter::sbbf::Sbbf, + bloom: &lance_core::utils::bloomfilter::sbbf::Sbbf, ) -> Result<()> { let data = bloom.to_bytes(); self.object_store diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec/bloom_guard.rs b/rust/lance/src/dataset/mem_wal/scanner/exec/bloom_guard.rs index 6039eed1629..632b08a753f 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/exec/bloom_guard.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/exec/bloom_guard.rs @@ -21,7 +21,7 @@ use datafusion::physical_plan::{ SendableRecordBatchStream, }; use futures::Stream; -use lance_index::scalar::bloomfilter::sbbf::Sbbf; +use 
lance_core::utils::bloomfilter::sbbf::Sbbf; /// Guards a child execution node with a bloom filter check. /// diff --git a/rust/lance/src/dataset/mem_wal/scanner/point_lookup.rs b/rust/lance/src/dataset/mem_wal/scanner/point_lookup.rs index a6063c2930c..d1353e72dcc 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/point_lookup.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/point_lookup.rs @@ -17,9 +17,9 @@ use datafusion::physical_plan::limit::GlobalLimitExec; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::prelude::{Expr, SessionContext}; use futures::TryStreamExt; +use lance_core::utils::bloomfilter::sbbf::Sbbf; use lance_core::{Result, is_system_column}; use lance_datafusion::exec::OneShotExec; -use lance_index::scalar::bloomfilter::sbbf::Sbbf; use tracing::instrument; use crate::dataset::mem_wal::index::IndexStore; diff --git a/rust/lance/src/dataset/write/merge_insert/inserted_rows.rs b/rust/lance/src/dataset/write/merge_insert/inserted_rows.rs index 89ec893705e..a0a7f93b653 100644 --- a/rust/lance/src/dataset/write/merge_insert/inserted_rows.rs +++ b/rust/lance/src/dataset/write/merge_insert/inserted_rows.rs @@ -15,7 +15,7 @@ use arrow_array::{ use arrow_schema::DataType; use deepsize::DeepSizeOf; use lance_core::Result; -use lance_index::scalar::bloomfilter::sbbf::{Sbbf, SbbfBuilder}; +use lance_core::utils::bloomfilter::sbbf::{Sbbf, SbbfBuilder}; use lance_table::format::pb; // Default bloom filter config: 8192 items @ 0.00057 fpp -> 16KiB filter From 8d0325896b1dccdec5ef3cfefaac6710ef444a14 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 5 Jun 2026 15:47:53 -0700 Subject: [PATCH 043/177] fix(python)!: derive index type from details instead of opening the index (#6903) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BREAKING CHANGE: `describe_indices()` now reports nested and special-character field names as full field paths (e.g. 
`meta.lang`, `` `user-id` ``) instead of just the leaf name. `list_indices()` called the `load_indices()` binding, which opened each index to derive its type and reported `"Unknown"` on any failure. `list_indices()` is now a thin wrapper over `describe_indices()`, which derives the type from index details without opening the index: - `describe_indices()` no longer errors on indices without index details; it returns a best-effort degraded entry instead. - When index details exist but no plugin is registered for the type URL, the type is derived from the type URL rather than `"Unknown"`. - `field_names` now uses the full field path, so nested fields are reported as dotted paths instead of just the leaf name. - `IndexSegmentDescription` gains a `base_id` field. - The unused `load_indices()` Python binding is removed. The `list_indices()` return type hint was incorrect (`List[Index]` — the method has always returned dicts). It now returns a typed `IndexInformation` `TypedDict`, so callers get key and value types instead of an opaque dict. ## Testing - Rust: `cargo test -p lance --lib index::`, `lance-index` registry tests — new tests cover the degraded entry and the type-URL fallback. - Python: `test_scalar_index.py`, `test_column_names.py`, `test_vector_index.py`, `test_optimize.py` — including new `list_indices()` characterization tests committed before the rework, plus index-without-details and legacy-vector cases. - Lint: `cargo fmt`, `cargo clippy` (lance, lance-index, pylance), `ruff`, `pyright`. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.7 (1M context) --- python/python/lance/dataset.py | 37 +++- python/python/lance/lance/__init__.pyi | 1 - .../python/lance/lance/indices/__init__.pyi | 1 + python/python/tests/test_column_names.py | 10 +- python/python/tests/test_optimize.py | 8 +- python/python/tests/test_scalar_index.py | 156 +++++++++++++- python/python/tests/test_vector_index.py | 4 +- python/src/dataset.rs | 83 +------- python/src/indices.rs | 17 +- rust/lance-index/src/registry.rs | 33 +++ rust/lance/src/index.rs | 192 ++++++++++++------ 11 files changed, 376 insertions(+), 166 deletions(-) diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index ccd44e6dcaf..38796e97439 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -958,14 +958,14 @@ def checkout_latest(self): """Check out the latest version of the current branch.""" self._ds.checkout_latest() - def list_indices(self) -> List[Index]: + def list_indices(self) -> List[IndexInformation]: """ Returns index information for all indices in the dataset. - This method is deprecated as it requires loading the statistics for each index - which can be a very expensive operation. Instead use describe_indices() to - list index information and index_statistics() to get the statistics for - individual indexes of interest. + This method is deprecated. Use describe_indices() instead, which returns + richer per-index information. + + Each returned :class:`IndexInformation` describes one index segment. """ warnings.warn( "The 'list_indices' method is deprecated. 
It may be removed in a future " @@ -973,7 +973,19 @@ def list_indices(self) -> List[Index]: DeprecationWarning, ) - return self._ds.load_indices() + return [ + { + "name": desc.name, + "type": desc.index_type, + "uuid": segment.uuid, + "fields": desc.field_names, + "version": segment.dataset_version_at_last_update, + "fragment_ids": segment.fragment_ids, + "base_id": segment.base_id, + } + for desc in self.describe_indices() + for segment in desc.segments + ] def describe_indices(self) -> List[IndexDescription]: """Returns index information for all indices in the dataset.""" @@ -5360,6 +5372,19 @@ class Index: index_details: Optional[Tuple[str, bytes]] = None +class IndexInformation(TypedDict): + """Information about a single index segment, as returned by + :meth:`LanceDataset.list_indices`.""" + + name: str + type: str + uuid: str + fields: List[str] + version: int + fragment_ids: Set[int] + base_id: Optional[int] + + class AutoCleanupConfig(TypedDict): interval: int older_than_seconds: int diff --git a/python/python/lance/lance/__init__.pyi b/python/python/lance/lance/__init__.pyi index 748698d169b..38d82738063 100644 --- a/python/python/lance/lance/__init__.pyi +++ b/python/python/lance/lance/__init__.pyi @@ -228,7 +228,6 @@ class _Dataset: def data_storage_version(self) -> str: ... def index_statistics(self, index_name: str) -> str: ... def serialized_manifest(self) -> bytes: ... - def load_indices(self) -> List[Index]: ... def describe_indices(self) -> List[IndexDescription]: ... def scanner( self, diff --git a/python/python/lance/lance/indices/__init__.pyi b/python/python/lance/lance/indices/__init__.pyi index d28bdd123fc..0f5db7037df 100644 --- a/python/python/lance/lance/indices/__init__.pyi +++ b/python/python/lance/lance/indices/__init__.pyi @@ -72,6 +72,7 @@ class IndexSegmentDescription: index_version: int created_at: Optional[datetime] size_bytes: Optional[int] + base_id: Optional[int] def __repr__(self) -> str: ... 
diff --git a/python/python/tests/test_column_names.py b/python/python/tests/test_column_names.py index f7b5962b523..d402ba5bcbb 100644 --- a/python/python/tests/test_column_names.py +++ b/python/python/tests/test_column_names.py @@ -349,7 +349,7 @@ def test_scalar_index_with_special_chars(self, special_char_dataset): indices = special_char_dataset.describe_indices() assert len(indices) == 1 - assert indices[0].field_names == ["user-id"] + assert indices[0].field_names == ["`user-id`"] assert indices[0].name == "user-id_idx" # Query using the indexed column (requires backticks in filter) @@ -462,7 +462,7 @@ def test_scalar_index_with_nested_mixed_case(self, nested_mixed_case_dataset): indices = nested_mixed_case_dataset.describe_indices() assert len(indices) == 1 assert indices[0].name == "MetaData.userId_idx" - assert indices[0].field_names == ["userId"] + assert indices[0].field_names == ["MetaData.userId"] # Query using the indexed column result = nested_mixed_case_dataset.to_table(filter="MetaData.userId = 50") @@ -512,7 +512,7 @@ def test_scalar_index_with_lowercased_nested_path(self, nested_mixed_case_datase assert len(indices) == 1 # Should store with correct case from schema assert indices[0].name == "MetaData.userId_idx" - assert indices[0].field_names == ["userId"] + assert indices[0].field_names == ["MetaData.userId"] # Query should also work with correct case result = nested_mixed_case_dataset.to_table(filter="MetaData.userId = 50") @@ -576,7 +576,7 @@ def test_scalar_index_with_nested_special_chars(self, nested_special_char_datase indices = nested_special_char_dataset.describe_indices() assert len(indices) == 1 - assert indices[0].field_names == ["user-id"] + assert indices[0].field_names == ["`meta-data`.`user-id`"] assert indices[0].name == "meta-data.user-id_idx" # Query using the indexed column (backticks required in filter) @@ -600,7 +600,7 @@ def test_scalar_index_on_top_level_special_chars(self, nested_special_char_datas indices = 
nested_special_char_dataset.describe_indices() assert len(indices) == 1 - assert indices[0].field_names == ["row-id"] + assert indices[0].field_names == ["`row-id`"] result = nested_special_char_dataset.to_table(filter="`row-id` = 50") assert result.num_rows == 1 diff --git a/python/python/tests/test_optimize.py b/python/python/tests/test_optimize.py index 801efcbd4f2..ccd889db116 100644 --- a/python/python/tests/test_optimize.py +++ b/python/python/tests/test_optimize.py @@ -330,11 +330,9 @@ def test_describe_indices_matches_list_indices_for_frag_reuse(tmp_path: Path): string for every index, including the __lance_frag_reuse system index that defer_index_remap produces. - list_indices() special-cases system indices via infer_system_index_type() - in python/src/dataset.rs. describe_indices() in - rust/lance/src/index.rs::IndexDescriptionImpl::try_new does not, so it - falls through to a plugin lookup that has no entry for - FragmentReuseIndexDetails and reports 'Unknown' instead. + list_indices() is a wrapper over describe_indices(), so the two must stay + in sync. System indices are identified by name via infer_system_index_type() + in rust/lance/src/index.rs::IndexDescriptionImpl::try_new. 
""" base_dir = tmp_path / "dataset" data = pa.table({"i": range(6_000), "val": range(6_000)}) diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index 59654d848df..5c36c3bd91a 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -147,7 +147,7 @@ def btree_comparison_datasets(tmp_path): } -def test_load_indices(indexed_dataset: lance.LanceDataset): +def test_describe_indices_vector_and_scalar(indexed_dataset: lance.LanceDataset): indices = indexed_dataset.describe_indices() vec_idx = next(idx for idx in indices if "VectorIndex" in idx.type_url) scalar_idx = next(idx for idx in indices if idx.index_type == "BTree") @@ -155,6 +155,154 @@ def test_load_indices(indexed_dataset: lance.LanceDataset): assert scalar_idx is not None +def test_list_indices_characterization(indexed_dataset: lance.LanceDataset): + """Lock down the backwards-compatible shape of the deprecated list_indices(). + + list_indices() returns a list of plain dicts (one per index segment), not + Index dataclasses. This characterization test guards the dict keys and + values so the deprecated method stays backwards compatible. 
+ """ + with pytest.warns(DeprecationWarning): + indices = indexed_dataset.list_indices() + + assert len(indices) == 2 + by_name = {idx["name"]: idx for idx in indices} + assert set(by_name) == {"vector_idx", "meta_idx"} + + expected_keys = { + "name", + "type", + "uuid", + "fields", + "version", + "fragment_ids", + "base_id", + } + for idx in indices: + assert set(idx) == expected_keys + assert isinstance(idx["uuid"], str) and len(idx["uuid"]) > 0 + assert isinstance(idx["fields"], list) + assert isinstance(idx["fragment_ids"], set) + assert isinstance(idx["version"], int) + assert idx["type"] != "Unknown" + assert idx["base_id"] is None + + vector_idx = by_name["vector_idx"] + assert vector_idx["type"] == "IVF_PQ" + assert vector_idx["fields"] == ["vector"] + assert vector_idx["fragment_ids"] == {0} + + meta_idx = by_name["meta_idx"] + assert meta_idx["type"] == "BTree" + assert meta_idx["fields"] == ["meta"] + assert meta_idx["fragment_ids"] == {0} + + +def test_list_indices_nested_field_path(tmp_path): + """list_indices() reports nested fields as full dotted paths.""" + schema = pa.schema( + [ + pa.field("id", pa.int64()), + pa.field("meta", pa.struct([pa.field("lang", pa.string())])), + ] + ) + data = pa.table( + { + "id": [1, 2, 3], + "meta": [{"lang": "en"}, {"lang": "fr"}, {"lang": "en"}], + }, + schema=schema, + ) + ds = lance.write_dataset(data, tmp_path) + ds.create_scalar_index(column="meta.lang", index_type="BTREE") + + with pytest.warns(DeprecationWarning): + indices = ds.list_indices() + + assert len(indices) == 1 + assert indices[0]["fields"] == ["meta.lang"] + + +def _commit_index(ds, index): + """Commit a single raw Index entry via the CreateIndex operation.""" + return lance.LanceDataset.commit( + ds.uri, + lance.LanceOperation.CreateIndex(new_indices=[index], removed_indices=[]), + read_version=ds.version, + ) + + +def test_list_indices_index_without_details(tmp_path): + """An index whose manifest entry has no index details (e.g. 
committed by an + older writer) is still reported on a best-effort basis: describe_indices() + does not error, and the type is reported as "Unknown".""" + from lance.dataset import Index + + data = pa.table({"id": range(100), "val": range(100)}) + ds = lance.write_dataset(data, tmp_path) + + field_id = ds.schema.get_field_index("id") + fragment_ids = {f.fragment_id for f in ds.get_fragments()} + ds = _commit_index( + ds, + Index( + uuid=str(uuid.uuid4()), + name="legacy_idx", + fields=[field_id], + dataset_version=ds.version, + fragment_ids=fragment_ids, + index_version=0, + ), + ) + + described = ds.describe_indices() + assert len(described) == 1 + assert described[0].name == "legacy_idx" + assert described[0].index_type == "Unknown" + assert described[0].type_url == "" + + with pytest.warns(DeprecationWarning): + listed = ds.list_indices() + assert len(listed) == 1 + assert listed[0]["name"] == "legacy_idx" + assert listed[0]["type"] == "Unknown" + + +def test_list_indices_legacy_vector_index_without_details(tmp_path): + """A legacy vector index predates VectorIndexDetails: it has no index + details but stores a monolithic index file. Its type is recognized as + "Vector" from the index file rather than reported as "Unknown".""" + from lance.dataset import Index, IndexFile + + data = pa.table({"id": range(100), "val": range(100)}) + ds = lance.write_dataset(data, tmp_path) + + field_id = ds.schema.get_field_index("id") + fragment_ids = {f.fragment_id for f in ds.get_fragments()} + ds = _commit_index( + ds, + Index( + uuid=str(uuid.uuid4()), + name="legacy_vector_idx", + fields=[field_id], + dataset_version=ds.version, + fragment_ids=fragment_ids, + index_version=0, + # "index.idx" is the legacy monolithic index file name; its presence + # is how a pre-details vector index is recognized. 
+ files=[IndexFile(path="index.idx", size_bytes=0)], + ), + ) + + described = ds.describe_indices() + assert len(described) == 1 + assert described[0].index_type == "Vector" + + with pytest.warns(DeprecationWarning): + listed = ds.list_indices() + assert listed[0]["type"] == "Vector" + + def test_indexed_scalar_scan(indexed_dataset: lance.LanceDataset, data_table: pa.Table): sample_meta = data_table["meta"][50] expected_price = data_table["price"][50] @@ -4187,7 +4335,7 @@ def test_nested_field_btree_index(tmp_path): # Verify index was created indices = dataset.describe_indices() assert len(indices) == 1 - assert indices[0].field_names == ["lang"] + assert indices[0].field_names == ["meta.lang"] assert indices[0].index_type == "BTree" # Test query using the index - filter for English language @@ -4288,7 +4436,7 @@ def test_nested_field_fts_index(tmp_path): # Verify index was created indices = ds.describe_indices() assert len(indices) == 1 - assert indices[0].field_names == ["text"] + assert indices[0].field_names == ["data.text"] assert indices[0].index_type == "Inverted" # Test full text search on nested field @@ -4362,7 +4510,7 @@ def test_nested_field_bitmap_index(tmp_path): # Verify index was created indices = ds.describe_indices() assert len(indices) == 1 - assert indices[0].field_names == ["color"] + assert indices[0].field_names == ["attributes.color"] assert indices[0].index_type == "Bitmap" # Test equality query diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 8a3a85bf6e7..505d20798bb 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -1822,7 +1822,7 @@ def test_fragment_scan_disallowed_on_ann_with_index_scan_prefilter(tmp_path): assert results == results_no_scalar_index -def test_load_indices(dataset): +def test_describe_indices(dataset): indices = dataset.describe_indices() assert len(indices) == 0 @@ -2166,7 +2166,7 @@ def 
test_nested_field_vector_index(tmp_path): # Verify index was created indices = dataset.describe_indices() assert len(indices) == 1 - assert indices[0].field_names == ["embedding"] + assert indices[0].field_names == ["data.embedding"] # Test querying with the index query_vec = vectors[0] diff --git a/python/src/dataset.rs b/python/src/dataset.rs index f9f6ed669b1..35589709075 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -24,7 +24,7 @@ use lance_index::vector::bq::storage::RabitQuantizationMetadata; use log::error; use object_store::path::Path; use pyo3::exceptions::{PyStopIteration, PyTypeError}; -use pyo3::types::{PyBytes, PyInt, PyList, PySet, PyString, PyTuple}; +use pyo3::types::{PyBytes, PyInt, PyList, PyString, PyTuple}; use pyo3::{IntoPyObjectExt, prelude::*}; use pyo3::{ PyResult, @@ -60,9 +60,7 @@ use lance::dataset::{ transaction::{Operation, Transaction}, }; use lance::index::vector::utils::get_vector_type; -use lance::index::{ - DatasetIndexExt, DatasetIndexInternalExt, IndexSegment, vector::VectorIndexParams, -}; +use lance::index::{DatasetIndexExt, IndexSegment, vector::VectorIndexParams}; use lance::{dataset::builder::DatasetBuilder, index::vector::IndexFileVersion}; use lance_arrow::as_fixed_size_list_array; use lance_core::Error; @@ -70,6 +68,7 @@ use lance_core::datatypes::BlobHandling; use lance_datafusion::utils::reader_to_stream; use lance_encoding::decoder::DecoderConfig; use lance_file::reader::FileReaderOptions; +use lance_index::scalar::inverted::query::Occur; use lance_index::scalar::inverted::query::{ BooleanQuery, BoostQuery, FtsQuery, MatchQuery, MultiMatchQuery, Operator, PhraseQuery, }; @@ -83,9 +82,6 @@ use lance_index::{ ivf::IvfBuildParams, pq::PQBuildParams, sq::builder::SQBuildParams, }, }; -use lance_index::{ - infer_system_index_type, metrics::NoOpMetricsCollector, scalar::inverted::query::Occur, -}; use lance_io::object_store::{ LanceNamespaceStorageOptionsProvider, ObjectStoreParams, 
StorageOptionsAccessor, }; @@ -953,79 +949,6 @@ impl Dataset { Ok(dict.into()) } - /// Load index metadata. - /// - /// This call will open the index and return its concrete index type. - fn load_indices(self_: PyRef<'_, Self>) -> PyResult>> { - let index_metadata = rt() - .block_on(Some(self_.py()), self_.ds.load_indices())? - .map_err(|err| PyValueError::new_err(err.to_string()))?; - let py = self_.py(); - index_metadata - .iter() - .map(|idx| { - let dict = PyDict::new(py); - let schema = self_.ds.schema(); - let field_paths = idx - .fields - .iter() - .map(|field_id| schema.field_path(*field_id).unwrap()) - .collect::>(); - - let ds = self_.ds.clone(); - let idx_type = match rt().block_on(Some(self_.py()), async { - if let Some(system_index_type) = infer_system_index_type(idx) { - Ok::<_, lance::Error>(system_index_type.to_string()) - } else { - let idx = ds - .open_generic_index( - &field_paths[0], - &idx.uuid.to_string(), - &NoOpMetricsCollector, - ) - .await?; - Ok::<_, lance::Error>(idx.index_type().to_string()) - } - })? { - Ok(r) => r, - Err(error) => { - log::warn!( - "Cannot derive index type for index {} (uuid={}, type_url={:?}, version={}) on dataset {}: {}", - idx.name, - idx.uuid, - idx.index_details.as_ref().map(|d| d.type_url.as_str()), - idx.index_version, - self_.ds.uri(), - error, - ); - // mark the type as unknown for any new index type - "Unknown".to_owned() - } - }; - - let fragment_set = PySet::empty(py).unwrap(); - if let Some(bitmap) = &idx.fragment_bitmap { - for fragment_id in bitmap.iter() { - fragment_set.add(fragment_id).unwrap(); - } - } - - dict.set_item("name", idx.name.clone()).unwrap(); - // TODO: once we add more than vector indices, we need to: - // 1. Change protos and write path to persist index type - // 2. 
Use the new field from idx instead of hard coding it to Vector - dict.set_item("type", idx_type).unwrap(); - dict.set_item("uuid", idx.uuid.to_string()).unwrap(); - dict.set_item("fields", field_paths).unwrap(); - dict.set_item("version", idx.dataset_version).unwrap(); - dict.set_item("fragment_ids", fragment_set).unwrap(); - dict.set_item("base_id", idx.base_id.map(|id| id as i64)) - .unwrap(); - dict.into_py_any(py) - }) - .collect::>>() - } - #[allow(clippy::too_many_arguments)] #[pyo3(signature=(columns=None, columns_with_transform=None, filter=None, search_filter=None, prefilter=None, limit=None, offset=None, nearest=None, batch_size=None, batch_size_bytes=None, io_buffer_size=None, batch_readahead=None, fragment_readahead=None, scan_in_order=None, fragments=None, index_segments=None, with_row_id=None, with_row_address=None, use_stats=None, substrait_filter=None, fast_search=None, full_text_query=None, late_materialization=None, blob_handling=None, use_scalar_index=None, include_deleted_rows=None, scan_stats_callback=None, strict_batch_size=None, order_by=None, disable_scoring_autoprojection=None, substrait_aggregate=None))] fn scanner( diff --git a/python/src/indices.rs b/python/src/indices.rs index 6efb8538d08..a93a59ac30d 100644 --- a/python/src/indices.rs +++ b/python/src/indices.rs @@ -634,6 +634,9 @@ pub struct PyIndexSegmentDescription { /// The total size in bytes of all files in this segment /// (None for backward compatibility with indices created before file tracking) pub size_bytes: Option, + /// The id of the dataset base path that stores this segment + /// (None when the segment is stored in the dataset's default base path) + pub base_id: Option, } impl PyIndexSegmentDescription { @@ -652,18 +655,20 @@ impl PyIndexSegmentDescription { index_version: segment.index_version, created_at: segment.created_at, size_bytes, + base_id: segment.base_id.map(|id| id as i64), } } pub fn __repr__(&self) -> String { format!( - "IndexSegmentDescription(uuid={}, 
dataset_version_at_last_update={}, fragment_ids={:?}, index_version={}, created_at={:?}, size_bytes={:?})", + "IndexSegmentDescription(uuid={}, dataset_version_at_last_update={}, fragment_ids={:?}, index_version={}, created_at={:?}, size_bytes={:?}, base_id={:?})", self.uuid, self.dataset_version_at_last_update, self.fragment_ids, self.index_version, self.created_at, - self.size_bytes + self.size_bytes, + self.base_id ) } } @@ -678,7 +683,8 @@ pub struct PyIndexDescription { pub index_type: String, /// The ids of the fields that the index is built on pub fields: Vec, - /// The names of the fields that the index is built on + /// The full paths of the fields that the index is built on + /// (dotted, with backtick-quoted segments for non-identifier names) pub field_names: Vec, /// The number of rows indexed by the index pub num_rows_indexed: u64, @@ -699,9 +705,8 @@ impl PyIndexDescription { .map(|field| { dataset .schema() - .field_by_id(*field as i32) - .map(|f| f.name.clone()) - .unwrap_or("".to_string()) + .field_path(*field as i32) + .unwrap_or_else(|_| "".to_string()) }) .collect(); diff --git a/rust/lance-index/src/registry.rs b/rust/lance-index/src/registry.rs index 1608baec8e6..a641bd2b8d0 100644 --- a/rust/lance-index/src/registry.rs +++ b/rust/lance-index/src/registry.rs @@ -16,6 +16,21 @@ use crate::{ }, }; +/// Derive a human-readable index type name from a details type URL. +/// +/// The display name is the final `.`-separated segment of the type URL with any +/// trailing `IndexDetails` removed. For example, `/lance.index.pb.VectorIndexDetails` +/// yields `Vector`. Used as a best-effort fallback when no plugin is registered +/// for the type URL, so the index type is never reported as opaque "Unknown" +/// while valid index details exist. 
+pub fn display_type_from_url(type_url: &str) -> &str { + let segment = type_url.rsplit('.').next().unwrap_or(type_url); + segment + .strip_suffix("IndexDetails") + .filter(|stripped| !stripped.is_empty()) + .unwrap_or(segment) +} + /// A registry of index plugins pub struct IndexPluginRegistry { plugins: HashMap>, @@ -112,6 +127,24 @@ impl IndexPluginRegistry { mod tests { use super::*; + #[test] + fn test_display_type_from_url() { + assert_eq!( + display_type_from_url("/lance.index.pb.VectorIndexDetails"), + "Vector" + ); + assert_eq!(display_type_from_url("BTreeIndexDetails"), "BTree"); + // Segment without the IndexDetails suffix is returned verbatim. + assert_eq!( + display_type_from_url("/lance.pb.SomethingElse"), + "SomethingElse" + ); + // A bare "IndexDetails" segment has nothing left after stripping, so it + // is returned as-is rather than an empty string. + assert_eq!(display_type_from_url("IndexDetails"), "IndexDetails"); + assert_eq!(display_type_from_url(""), ""); + } + #[test] fn test_get_plugin_by_name_accepts_case_insensitive_builtin_names() { let registry = IndexPluginRegistry::with_default_plugins(); diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 3a5f9975810..cc6dad8d1f3 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -47,6 +47,7 @@ use lance_index::{INDEX_FILE_NAME, Index, IndexType, PrewarmOptions, pb, vector: use lance_index::{ IndexCriteria, is_system_index, metrics::{MetricsCollector, NoOpMetricsCollector}, + registry::display_type_from_url, scalar::btree::BTREE_LOOKUP_NAME, }; use lance_io::scheduler::{ScanScheduler, SchedulerConfig}; @@ -673,7 +674,10 @@ struct IndexDescriptionImpl { field_ids: Vec, segments: Vec, index_type: String, - details: IndexDetails, + /// Index details, or `None` for indices created before details were + /// persisted in the manifest. Such indices are still described on a + /// best-effort basis rather than rejected. 
+ details: Option, rows_indexed: u64, } @@ -701,57 +705,52 @@ impl IndexDescriptionImpl { } let field_ids_vec: Vec = field_ids.iter().map(|id| *id as u32).collect(); - // This should not fail as we have already filtered out indexes without index details. - let index_details = example_metadata.index_details.as_ref().ok_or_else(|| { - let fields = field_ids - .iter() - .map(|id| { - dataset - .schema() - .field_by_id(*id) - .map(|f| format!("{}({})", f.name, id)) - .unwrap_or_else(|| format!("({})", id)) - }) - .collect::>() - .join(", "); - - Error::index(format!( - "Index details are required for index description. This index must be retrained to support this method. (index_name={}, uuid={}, fields=[{}])", - name, - example_metadata.uuid, - fields - )) - })?; - let type_url = &index_details.type_url; - if !segments.iter().all(|shard| { - shard - .index_details - .as_ref() - .map(|d| d.type_url == *type_url) - .unwrap_or(false) - }) { - return Err(Error::index( - "Index type URL should be present and identical across all segments".to_string(), - )); + // Index details may be absent on indices created before details were + // persisted in the manifest. We describe such indices on a best-effort + // basis rather than erroring, so callers can still see they exist. 
+ let details = example_metadata.index_details.clone().map(IndexDetails); + if let Some(details) = details.as_ref() { + let type_url = &details.0.type_url; + if !segments.iter().all(|shard| { + shard + .index_details + .as_ref() + .map(|d| d.type_url == *type_url) + .unwrap_or(false) + }) { + return Err(Error::index( + "Index type URL should be present and identical across all segments" + .to_string(), + )); + } } - let details = IndexDetails(index_details.clone()); - - let index_type = if details.is_vector() { - derive_vector_index_type(index_details) - } else if let Some(system_type) = lance_index::infer_system_index_type(example_metadata) { - // System indices (frag-reuse, mem-wal) are identified by name, not - // by a plugin entry, so the plugin lookup below would return - // "Unknown" otherwise. - system_type.to_string() - } else { - // We attempted to infer the index type when we loaded the indices, - // so if we hit this branch the index type is truly unknown. - details - .get_plugin() - .map(|p| p.name().to_string()) - .unwrap_or_else(|_| "Unknown".to_string()) - }; + let index_type = + if let Some(system_type) = lance_index::infer_system_index_type(example_metadata) { + // System indices (frag-reuse, mem-wal) are identified by name, not + // by index details, so this must be checked before the plugin lookup. + system_type.to_string() + } else if let Some(details) = details.as_ref() { + if details.is_vector() { + derive_vector_index_type(&details.0) + } else { + // Fall back to a name derived from the type URL when no plugin + // is registered, so a known type URL is never reported as the + // opaque "Unknown". + details + .get_plugin() + .map(|p| p.name().to_string()) + .unwrap_or_else(|_| { + display_type_from_url(details.0.type_url.as_str()).to_string() + }) + } + } else if segment_has_vector_details(example_metadata) { + // Legacy vector indices predate VectorIndexDetails and are + // recognized by their monolithic index file name. 
+ "Vector".to_string() + } else { + "Unknown".to_string() + }; let mut fragment_rows = HashMap::with_capacity(dataset.manifest.fragments.len()); for fragment in dataset.iter_fragments() { @@ -828,7 +827,10 @@ impl IndexDescription for IndexDescriptionImpl { } fn type_url(&self) -> &str { - self.details.0.type_url.as_str() + self.details + .as_ref() + .map(|d| d.0.type_url.as_str()) + .unwrap_or("") } fn rows_indexed(&self) -> u64 { @@ -836,13 +838,14 @@ impl IndexDescription for IndexDescriptionImpl { } fn details(&self) -> Result { - if self.details.is_vector() { - vector_details_as_json(&self.details.0) + let Some(details) = self.details.as_ref() else { + return Ok("{}".to_string()); + }; + if details.is_vector() { + vector_details_as_json(&details.0) } else { - let plugin = self.details.get_plugin()?; - plugin - .details_as_json(&self.details.0) - .map(|v| v.to_string()) + let plugin = details.get_plugin()?; + plugin.details_as_json(&details.0).map(|v| v.to_string()) } } @@ -4463,6 +4466,81 @@ mod tests { assert_eq!(descriptions[0].index_type(), inferred_type); } + #[tokio::test] + async fn test_describe_indices_tolerates_missing_index_details() { + // An index whose manifest entry has no index details (e.g. created + // before details were persisted) is still described on a best-effort + // basis rather than causing describe_indices to error. 
+ use lance_datagen::{BatchCount, RowCount, array}; + + let test_dir = tempfile::tempdir().unwrap(); + let test_uri = test_dir.path().to_str().unwrap(); + let reader = lance_datagen::gen_batch() + .col("id", array::step::()) + .into_reader_rows(RowCount::from(10), BatchCount::from(1)); + let dataset = Dataset::write(reader, test_uri, None).await.unwrap(); + let field_id = dataset.schema().field("id").unwrap().id; + + let metadata = IndexMetadata { + uuid: Uuid::new_v4(), + name: "mystery_idx".to_string(), + fields: vec![field_id], + dataset_version: dataset.manifest.version, + fragment_bitmap: Some(std::iter::once(0_u32).collect()), + index_details: None, + index_version: 0, + created_at: None, + base_id: None, + files: None, + }; + + let desc = IndexDescriptionImpl::try_new(vec![metadata], &dataset) + .await + .unwrap(); + assert_eq!(desc.index_type(), "Unknown"); + assert_eq!(desc.type_url(), ""); + assert_eq!(desc.details().unwrap(), "{}"); + assert_eq!(desc.rows_indexed(), 10); + } + + #[tokio::test] + async fn test_describe_indices_derives_type_from_url_without_plugin() { + // When index details exist but no plugin is registered for the type + // URL, the index type is derived from the type URL rather than being + // reported as the opaque "Unknown". 
+ use lance_datagen::{BatchCount, RowCount, array}; + + let test_dir = tempfile::tempdir().unwrap(); + let test_uri = test_dir.path().to_str().unwrap(); + let reader = lance_datagen::gen_batch() + .col("id", array::step::()) + .into_reader_rows(RowCount::from(10), BatchCount::from(1)); + let dataset = Dataset::write(reader, test_uri, None).await.unwrap(); + let field_id = dataset.schema().field("id").unwrap().id; + + let metadata = IndexMetadata { + uuid: Uuid::new_v4(), + name: "mystery_idx".to_string(), + fields: vec![field_id], + dataset_version: dataset.manifest.version, + fragment_bitmap: Some(std::iter::once(0_u32).collect()), + index_details: Some(Arc::new(prost_types::Any { + type_url: "/lance.index.pb.MysteryIndexDetails".to_string(), + value: Vec::new(), + })), + index_version: 0, + created_at: None, + base_id: None, + files: None, + }; + + let desc = IndexDescriptionImpl::try_new(vec![metadata], &dataset) + .await + .unwrap(); + assert_eq!(desc.index_type(), "Mystery"); + assert_eq!(desc.type_url(), "/lance.index.pb.MysteryIndexDetails"); + } + #[rstest] #[case::btree("i", IndexType::BTree, Box::new(ScalarIndexParams::default()))] #[case::bitmap("i", IndexType::Bitmap, Box::new(ScalarIndexParams::default()))] From 9c81ac33df1d78e615502b3fb9fc5f843f1fdbd9 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 5 Jun 2026 16:13:03 -0700 Subject: [PATCH 044/177] feat: add EnforceDistribution to physical optimizer (#7086) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - Adds `EnforceDistribution` to `get_physical_optimizer()` after `LimitPushdown`. This rule automatically inserts `CoalescePartitionsExec`/`RepartitionExec` exchange nodes where distribution requirements aren't satisfied, laying groundwork to remove manual exchange insertion in `scanner.rs` as exec nodes migrate to multi-partition output. - Wires `target_partitions` into the `ConfigOptions` passed to the optimizer. 
Previously `Default::default()` was used, ignoring the user's concurrency setting. `Scanner` now carries an optional `target_partitions` field (defaulting to `get_num_compute_intensive_cpus()`). - Fixes `KNNVectorDistanceExec` non-batch mode to correctly declare `required_input_distribution() = SinglePartition`. The execute path reads a single input partition, so all candidates must be coalesced before distance computation; the prior `UnspecifiedDistribution` declaration was incorrect. All snapshot tests updated: manual `RepartitionExec(1)` nodes preceding `SinglePartition`-requiring execs are replaced by `CoalescePartitionsExec`; unnecessary repartitions under `UnspecifiedDistribution` parents are removed. `assert_plan_equals` sets `target_partitions=1` to make plan snapshots machine-independent. Closes #7081. Part of #6967. 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Sonnet 4.6 --- rust/lance-datafusion/src/exec.rs | 13 ++- rust/lance/src/dataset/scanner.rs | 100 +++++++++++------- .../src/dataset/tests/dataset_aggregate.rs | 3 + rust/lance/src/io/exec/knn.rs | 8 +- rust/lance/src/io/exec/optimizer.rs | 3 + 5 files changed, 82 insertions(+), 45 deletions(-) diff --git a/rust/lance-datafusion/src/exec.rs b/rust/lance-datafusion/src/exec.rs index 5d7c5465132..8f346f45612 100644 --- a/rust/lance-datafusion/src/exec.rs +++ b/rust/lance-datafusion/src/exec.rs @@ -28,6 +28,7 @@ use datafusion::{ physical_plan::{ DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, SendableRecordBatchStream, analyze::AnalyzeExec, + coalesce_partitions::CoalescePartitionsExec, display::DisplayableExecutionPlan, execution_plan::{Boundedness, CardinalityEffect, EmissionType}, metrics::MetricValue, @@ -606,9 +607,15 @@ pub fn execute_plan( let session_ctx = get_session_context(&options); - // NOTE: we are only executing the first partition here. Therefore, if - // the plan has more than one partition, we will be missing data. 
- assert_eq!(plan.properties().partitioning.partition_count(), 1); + // Coalesce to a single partition if the optimizer left more than one. + // EnforceDistribution may remove RepartitionExec(1) nodes when the parent + // declares UnspecifiedDistribution, leaving multi-partition plans here. + let plan: Arc = if plan.properties().partitioning.partition_count() == 1 { + plan + } else { + Arc::new(CoalescePartitionsExec::new(plan)) + }; + let stream = plan.execute(0, get_task_context(&session_ctx, &options))?; let schema = stream.schema(); diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index 74f2619f7c2..38328894d2e 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -829,6 +829,15 @@ pub struct Scanner { /// Which version of the relational algebra to use when generating the physical plan relational_algebra_version: u32, + /// Target degree of parallelism for the physical optimizer. + /// + /// This is passed as `ConfigOptions::execution::target_partitions` to the + /// physical optimizer (e.g. `EnforceDistribution`), which uses it to decide + /// how many parallel partitions to target when inserting exchange nodes. + /// + /// Defaults to `get_num_compute_intensive_cpus()`. + target_parallelism: Option, + // Legacy fields to help migrate some old projection behavior to new behavior // // There are two behaviors we are moving away from: @@ -1053,6 +1062,7 @@ impl Scanner { explicit_projection: false, autoproject_scoring_columns: true, relational_algebra_version: LANCE_RELATIONAL_ALGEBRA_VERSION, + target_parallelism: None, }; scanner.apply_blob_handling(); scanner @@ -1379,6 +1389,16 @@ impl Scanner { self } + /// Set the target number of partitions for the physical optimizer. + /// + /// Overrides the default (`get_num_compute_intensive_cpus()`). Used by + /// `EnforceDistribution` and similar rules to decide how many parallel + /// partitions to use. 
Set to 1 in tests that assert specific plan shapes. + pub fn target_parallelism(&mut self, n: usize) -> &mut Self { + self.target_parallelism = Some(n); + self + } + /// Set whether to read data in order (default: true) /// /// A scan will always read from the disk concurrently. If this property @@ -2617,7 +2637,10 @@ impl Scanner { plan = self.apply_aggregate(plan, agg).await?; let optimizer = get_physical_optimizer(); - let options = Default::default(); + let mut options = ConfigOptions::default(); + options.execution.target_partitions = self + .target_parallelism + .unwrap_or_else(get_num_compute_intensive_cpus); for rule in optimizer.rules { plan = rule.optimize(plan, &options)?; } @@ -2681,7 +2704,10 @@ impl Scanner { } let optimizer = get_physical_optimizer(); - let options: ConfigOptions = Default::default(); + let mut options = ConfigOptions::default(); + options.execution.target_partitions = self + .target_parallelism + .unwrap_or_else(get_num_compute_intensive_cpus); for rule in optimizer.rules { plan = rule.optimize(plan, &options)?; } @@ -8297,6 +8323,9 @@ mod test { expected: &str, ) -> Result<()> { let mut scan = dataset.scan(); + // Pin target_parallelism=1 so EnforceDistribution produces deterministic plans + // regardless of the machine's CPU count. + scan.target_parallelism(1); plan(&mut scan)?; let exec_plan = scan.create_plan().await?; assert_plan_node_equals(exec_plan, expected).await @@ -9675,7 +9704,7 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") FilterExec: _distance@... IS NOT NULL SortExec: TopK(fetch=6), expr=... KNNVectorDistance: metric=l2 - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 + CoalescePartitionsExec UnionExec ProjectionExec: expr=[_distance@2 as _distance, _rowid@1 as _rowid, vec@0 as vec] FilterExec: _distance@... IS NOT NULL @@ -9707,7 +9736,7 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") FilterExec: _distance@... 
IS NOT NULL SortExec: TopK(fetch=15), expr=... KNNVectorDistance: metric=l2 - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 + CoalescePartitionsExec UnionExec ProjectionExec: expr=[_distance@2 as _distance, _rowid@1 as _rowid, vec@0 as vec] FilterExec: _distance@... IS NOT NULL @@ -9735,7 +9764,7 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") FilterExec: _distance@... IS NOT NULL SortExec: TopK(fetch=5), expr=... KNNVectorDistance: metric=l2 - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 + CoalescePartitionsExec UnionExec ProjectionExec: expr=[_distance@3 as _distance, _rowid@2 as _rowid, vec@0 as vec] FilterExec: _distance@... IS NOT NULL @@ -9757,7 +9786,7 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") FilterExec: _distance@... IS NOT NULL SortExec: TopK(fetch=5), expr=... KNNVectorDistance: metric=l2 - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 + CoalescePartitionsExec UnionExec ProjectionExec: expr=[_distance@3 as _distance, _rowid@2 as _rowid, vec@0 as vec] FilterExec: _distance@... IS NOT NULL @@ -9856,7 +9885,7 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") FilterExec: _distance@... IS NOT NULL SortExec: TopK(fetch=8), expr=... KNNVectorDistance: metric=l2 - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 + CoalescePartitionsExec UnionExec ProjectionExec: expr=[_distance@3 as _distance, _rowid@2 as _rowid, vec@0 as vec] FilterExec: _distance@... IS NOT NULL @@ -9892,7 +9921,7 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") FilterExec: _distance@... IS NOT NULL SortExec: TopK(fetch=11), expr=... 
KNNVectorDistance: metric=l2 - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 + CoalescePartitionsExec UnionExec ProjectionExec: expr=[_distance@3 as _distance, _rowid@2 as _rowid, vec@0 as vec] FilterExec: _distance@... IS NOT NULL @@ -9985,14 +10014,13 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") log::info!("Test case: Combined Scalar/non-scalar filtered read"); let expected = if data_storage_version == LanceFileVersion::Legacy { "ProjectionExec: expr=[s@1 as s] - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 - UnionExec - Take: columns=\"_rowid, (s)\" - CoalesceBatchesExec: target_batch_size=8192 - MaterializeIndex: query=[i > 10]@i_idx(BTree) - ProjectionExec: expr=[_rowid@2 as _rowid, s@1 as s] - FilterExec: i@0 > 10 - LanceScan: uri=..., projection=[i, s], row_id=true, row_addr=false, ordered=false, range=None" + UnionExec + Take: columns=\"_rowid, (s)\" + CoalesceBatchesExec: target_batch_size=8192 + MaterializeIndex: query=[i > 10]@i_idx(BTree) + ProjectionExec: expr=[_rowid@2 as _rowid, s@1 as s] + FilterExec: i@0 > 10 + LanceScan: uri=..., projection=[i, s], row_id=true, row_addr=false, ordered=false, range=None" } else { "LanceRead: uri=..., projection=[s], num_fragments=5, range_before=None, \ range_after=None, row_id=false, row_addr=false, full_filter=i > Int32(10), refine_filter=-- @@ -10008,13 +10036,12 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") log::info!("Test case: Combined Scalar/non-scalar filtered read with empty projection"); let expected = if data_storage_version == LanceFileVersion::Legacy { "ProjectionExec: expr=[_rowaddr@0 as _rowaddr] - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 - UnionExec - AddRowAddrExec - MaterializeIndex: query=[i > 10]@i_idx(BTree) - ProjectionExec: expr=[_rowaddr@2 as _rowaddr, _rowid@1 as _rowid] - FilterExec: i@0 > 10 - LanceScan: uri=..., projection=[i], 
row_id=true, row_addr=true, ordered=false, range=None" + UnionExec + AddRowAddrExec + MaterializeIndex: query=[i > 10]@i_idx(BTree) + ProjectionExec: expr=[_rowaddr@2 as _rowaddr, _rowid@1 as _rowid] + FilterExec: i@0 > 10 + LanceScan: uri=..., projection=[i], row_id=true, row_addr=true, ordered=false, range=None" } else { "LanceRead: uri=..., projection=[], num_fragments=5, range_before=None, \ range_after=None, row_id=false, row_addr=true, full_filter=i > Int32(10), refine_filter=-- @@ -10037,14 +10064,13 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") log::info!("Test case: Dynamic projection"); let expected = if data_storage_version == LanceFileVersion::Legacy { "ProjectionExec: expr=[regexp_match(s@1, .*) as matches] - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 - UnionExec - Take: columns=\"_rowid, (s)\" - CoalesceBatchesExec: target_batch_size=8192 - MaterializeIndex: query=[i > 10]@i_idx(BTree) - ProjectionExec: expr=[_rowid@2 as _rowid, s@1 as s] - FilterExec: i@0 > 10 - LanceScan: uri=..., row_id=true, row_addr=false, ordered=false, range=None" + UnionExec + Take: columns=\"_rowid, (s)\" + CoalesceBatchesExec: target_batch_size=8192 + MaterializeIndex: query=[i > 10]@i_idx(BTree) + ProjectionExec: expr=[_rowid@2 as _rowid, s@1 as s] + FilterExec: i@0 > 10 + LanceScan: uri=..., row_id=true, row_addr=false, ordered=false, range=None" } else { "ProjectionExec: expr=[regexp_match(s@0, .*) as matches] LanceRead: uri=..., projection=[s], num_fragments=5, range_before=None, \ @@ -10127,7 +10153,7 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") Take: columns="_rowid, _score, (s)" CoalesceBatchesExec: target_batch_size=8192 MatchQuery: column=s, query=hello - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 + CoalescePartitionsExec UnionExec MaterializeIndex: query=[i > 10]@i_idx(BTree) ProjectionExec: expr=[_rowid@1 as _rowid] @@ -10159,7 +10185,7 
@@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") Take: columns="_rowid, _score, (s)" CoalesceBatchesExec: target_batch_size=8192 SortExec: expr=[_score@1 DESC NULLS LAST], preserve_partitioning=[false] - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 + CoalescePartitionsExec UnionExec MatchQuery: column=s, query=hello FlatMatchQuery: column=s, query=hello @@ -10201,10 +10227,10 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") Take: columns="_rowid, _score, (s)" CoalesceBatchesExec: target_batch_size=8192 SortExec: expr=[_score@1 DESC NULLS LAST], preserve_partitioning=[false] - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 + CoalescePartitionsExec UnionExec MatchQuery: column=s, query=hello - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 + CoalescePartitionsExec UnionExec MaterializeIndex: query=[i > 10]@i_idx(BTree) ProjectionExec: expr=[_rowid@1 as _rowid] @@ -10218,7 +10244,7 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") Take: columns="_rowid, _score, (s)" CoalesceBatchesExec: target_batch_size=8192 SortExec: expr=[_score@1 DESC NULLS LAST], preserve_partitioning=[false] - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 + CoalescePartitionsExec UnionExec MatchQuery: column=s, query=hello LanceRead: uri=..., projection=[], num_fragments=5, range_before=None, range_after=None, row_id=true, row_addr=false, full_filter=i > Int32(10), refine_filter=-- @@ -10295,7 +10321,7 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") FilterExec: _distance@2 IS NOT NULL SortExec: TopK(fetch=34), expr=[_distance@2 ASC NULLS LAST, _rowid@0 ASC NULLS LAST]... 
KNNVectorDistance: metric=l2 - RepartitionExec: partitioning=RoundRobinBatch(1), input_partitions=2 + CoalescePartitionsExec UnionExec ProjectionExec: expr=[_distance@2 as _distance, _rowid@1 as _rowid, vec@0 as vec] FilterExec: _distance@2 IS NOT NULL diff --git a/rust/lance/src/dataset/tests/dataset_aggregate.rs b/rust/lance/src/dataset/tests/dataset_aggregate.rs index 35bf0ac1f29..d10a3e42769 100644 --- a/rust/lance/src/dataset/tests/dataset_aggregate.rs +++ b/rust/lance/src/dataset/tests/dataset_aggregate.rs @@ -1351,6 +1351,9 @@ async fn test_scanner_count_rows_with_partial_index_coverage() { scanner .aggregate(AggregateExpr::builder().count_star().build()) .unwrap(); + // Pin target_parallelism=1 so EnforceDistribution produces a deterministic + // plan snapshot regardless of the machine's CPU count. + scanner.target_parallelism(1); let plan = scanner.create_plan().await.unwrap(); assert_plan_node_equals( diff --git a/rust/lance/src/io/exec/knn.rs b/rust/lance/src/io/exec/knn.rs index 860e4322ebf..f6d1bc06636 100644 --- a/rust/lance/src/io/exec/knn.rs +++ b/rust/lance/src/io/exec/knn.rs @@ -637,11 +637,9 @@ impl ExecutionPlan for KNNVectorDistanceExec { } fn required_input_distribution(&self) -> Vec { - if self.is_batch { - vec![Distribution::SinglePartition] - } else { - vec![Distribution::UnspecifiedDistribution] - } + // Both batch and non-batch modes execute a single input partition at a time, + // so all input must be coalesced to one partition before distance computation. 
+ vec![Distribution::SinglePartition] } } diff --git a/rust/lance/src/io/exec/optimizer.rs b/rust/lance/src/io/exec/optimizer.rs index 96a93823424..72488f3a14e 100644 --- a/rust/lance/src/io/exec/optimizer.rs +++ b/rust/lance/src/io/exec/optimizer.rs @@ -179,5 +179,8 @@ pub fn get_physical_optimizer() -> PhysicalOptimizer { Arc::new(crate::io::exec::optimizer::SimplifyProjection), // Push down limit into FilteredReadExec and other Execs via with_fetch() Arc::new(datafusion::physical_optimizer::limit_pushdown::LimitPushdown::new()), + // Insert exchange nodes (RepartitionExec, CoalescePartitionsExec) where needed + // to satisfy distribution requirements as exec nodes migrate to multi-partition output. + Arc::new(datafusion::physical_optimizer::enforce_distribution::EnforceDistribution::new()), ]) } From 352f4322753d02a31826ebca9872e75fcb9949a1 Mon Sep 17 00:00:00 2001 From: WenDing-Y <1062698930@qq.com> Date: Sat, 6 Jun 2026 07:39:18 +0800 Subject: [PATCH 045/177] feat(java): add missing scanner and merge insert params to align with Python/Rust (#7100) ## feat(java): add missing scanner and merge insert params to align with Python/Rust ### Summary Add 4 parameters to the Java SDK that already exist in the Rust core and Python bindings but were absent in Java. 
### Changes | Param | Type | API | Rust method | |-------|------|-----|-------------| | includeDeletedRows | boolean | ScanOptions | Scanner::include_deleted_rows() | | strictBatchSize | boolean | ScanOptions | Scanner::strict_batch_size() | | disableScoringAutoprojection | boolean | ScanOptions | Scanner::disable_scoring_autoprojection() | | useIndex | boolean | MergeInsertParams | MergeInsertBuilder::use_index() | ### Files Changed - `java/lance-jni/src/async_scanner.rs` - `java/lance-jni/src/blocking_scanner.rs` - `java/lance-jni/src/merge_insert.rs` - `java/src/main/java/org/lance/ipc/AsyncScanner.java` - `java/src/main/java/org/lance/ipc/LanceScanner.java` - `java/src/main/java/org/lance/ipc/ScanOptions.java` - `java/src/main/java/org/lance/merge/MergeInsertParams.java` - `java/src/test/java/org/lance/AsyncScannerTest.java` - `java/src/test/java/org/lance/MergeInsertTest.java` - `java/src/test/java/org/lance/ScannerTest.java` **+376 insertions, -6 deletions** --- java/lance-jni/src/async_scanner.rs | 12 ++ java/lance-jni/src/blocking_scanner.rs | 25 ++++ java/lance-jni/src/merge_insert.rs | 7 ++ .../main/java/org/lance/ipc/AsyncScanner.java | 10 +- .../main/java/org/lance/ipc/LanceScanner.java | 10 +- .../main/java/org/lance/ipc/ScanOptions.java | 88 +++++++++++++- .../org/lance/merge/MergeInsertParams.java | 22 ++++ .../test/java/org/lance/AsyncScannerTest.java | 69 +++++++++++ .../test/java/org/lance/MergeInsertTest.java | 23 ++++ java/src/test/java/org/lance/ScannerTest.java | 114 ++++++++++++++++++ 10 files changed, 374 insertions(+), 6 deletions(-) diff --git a/java/lance-jni/src/async_scanner.rs b/java/lance-jni/src/async_scanner.rs index 7cb71c37086..6da10479266 100644 --- a/java/lance-jni/src/async_scanner.rs +++ b/java/lance-jni/src/async_scanner.rs @@ -193,6 +193,9 @@ pub extern "system" fn Java_org_lance_ipc_AsyncScanner_createAsyncScanner<'local use_scalar_index: jboolean, fast_search: jboolean, substrait_aggregate_obj: JObject<'local>, + 
include_deleted_rows: jboolean, + strict_batch_size: jboolean, + disable_scoring_autoprojection: jboolean, ) -> JObject<'local> { crate::ok_or_throw!( env, @@ -216,6 +219,9 @@ pub extern "system" fn Java_org_lance_ipc_AsyncScanner_createAsyncScanner<'local use_scalar_index, fast_search, substrait_aggregate_obj, + include_deleted_rows, + strict_batch_size, + disable_scoring_autoprojection, ) ) } @@ -241,6 +247,9 @@ fn inner_create_async_scanner<'local>( use_scalar_index: jboolean, fast_search: jboolean, substrait_aggregate_obj: JObject<'local>, + include_deleted_rows: jboolean, + strict_batch_size: jboolean, + disable_scoring_autoprojection: jboolean, ) -> Result> { let dataset_guard = unsafe { env.get_rust_field::<_, _, BlockingDataset>(jdataset, NATIVE_DATASET) }?; @@ -265,6 +274,9 @@ fn inner_create_async_scanner<'local>( use_scalar_index, fast_search, substrait_aggregate_obj, + include_deleted_rows, + strict_batch_size, + disable_scoring_autoprojection, }; let scanner = build_scanner_with_options(env, &dataset, options)?; diff --git a/java/lance-jni/src/blocking_scanner.rs b/java/lance-jni/src/blocking_scanner.rs index f18b0d92a27..700dea2f944 100644 --- a/java/lance-jni/src/blocking_scanner.rs +++ b/java/lance-jni/src/blocking_scanner.rs @@ -247,6 +247,9 @@ pub(crate) struct ScannerOptions<'a> { pub use_scalar_index: jboolean, pub fast_search: jboolean, pub substrait_aggregate_obj: JObject<'a>, + pub include_deleted_rows: jboolean, + pub strict_batch_size: jboolean, + pub disable_scoring_autoprojection: jboolean, } /// Build a scanner with options applied - shared by blocking and async scanners @@ -394,6 +397,16 @@ pub(crate) fn build_scanner_with_options<'a>( scanner.aggregate(AggregateExpr::substrait(substrait_aggregate))?; } + if options.include_deleted_rows == JNI_TRUE { + scanner.include_deleted_rows(); + } + + scanner.strict_batch_size(options.strict_batch_size == JNI_TRUE); + + if options.disable_scoring_autoprojection == JNI_TRUE { + 
scanner.disable_scoring_autoprojection(); + } + Ok(scanner) } @@ -423,6 +436,9 @@ pub extern "system" fn Java_org_lance_ipc_LanceScanner_createScanner<'local>( fast_search: jboolean, // boolean substrait_aggregate_obj: JObject<'local>, // Optional collect_stats: jboolean, // boolean + include_deleted_rows: jboolean, // boolean + strict_batch_size: jboolean, // boolean + disable_scoring_autoprojection: jboolean, // boolean ) -> JObject<'local> { ok_or_throw!( env, @@ -447,6 +463,9 @@ pub extern "system" fn Java_org_lance_ipc_LanceScanner_createScanner<'local>( fast_search, substrait_aggregate_obj, collect_stats, + include_deleted_rows, + strict_batch_size, + disable_scoring_autoprojection, ) ) } @@ -473,6 +492,9 @@ fn inner_create_scanner<'local>( fast_search: jboolean, substrait_aggregate_obj: JObject<'local>, collect_stats: jboolean, + include_deleted_rows: jboolean, + strict_batch_size: jboolean, + disable_scoring_autoprojection: jboolean, ) -> Result> { let dataset_guard = unsafe { env.get_rust_field::<_, _, BlockingDataset>(jdataset, NATIVE_DATASET) }?; @@ -497,6 +519,9 @@ fn inner_create_scanner<'local>( use_scalar_index, fast_search, substrait_aggregate_obj, + include_deleted_rows, + strict_batch_size, + disable_scoring_autoprojection, }; let scanner = build_scanner_with_options(env, &dataset, options)?; diff --git a/java/lance-jni/src/merge_insert.rs b/java/lance-jni/src/merge_insert.rs index 0898d1b049f..df4d63bd2f6 100644 --- a/java/lance-jni/src/merge_insert.rs +++ b/java/lance-jni/src/merge_insert.rs @@ -51,6 +51,7 @@ fn inner_merge_insert<'local>( let conflict_retries = extract_conflict_retries(env, &jparam)?; let retry_timeout_ms = extract_retry_timeout_ms(env, &jparam)?; let skip_auto_cleanup = extract_skip_auto_cleanup(env, &jparam)?; + let use_index = extract_use_index(env, &jparam)?; let marked_generations = extract_marked_generations(env, &jparam)?; let (new_ds, merge_stats) = unsafe { @@ -69,6 +70,7 @@ fn inner_merge_insert<'local>( 
.conflict_retries(conflict_retries) .retry_timeout(Duration::from_millis(retry_timeout_ms as u64)) .skip_auto_cleanup(skip_auto_cleanup) + .use_index(use_index) .mark_generations_as_merged(marked_generations) .try_build()?; @@ -234,6 +236,11 @@ fn extract_skip_auto_cleanup<'local>(env: &mut JNIEnv<'local>, jparam: &JObject) Ok(skip_auto_cleanup) } +fn extract_use_index<'local>(env: &mut JNIEnv<'local>, jparam: &JObject) -> Result { + let use_index = env.call_method(jparam, "useIndex", "()Z", &[])?.z()?; + Ok(use_index) +} + fn extract_marked_generations<'local>( env: &mut JNIEnv<'local>, jparam: &JObject, diff --git a/java/src/main/java/org/lance/ipc/AsyncScanner.java b/java/src/main/java/org/lance/ipc/AsyncScanner.java index 2ec317cb245..6e515e3546c 100644 --- a/java/src/main/java/org/lance/ipc/AsyncScanner.java +++ b/java/src/main/java/org/lance/ipc/AsyncScanner.java @@ -80,7 +80,10 @@ public static AsyncScanner create( options.getColumnOrderings(), options.isUseScalarIndex(), options.isFastSearch(), - options.getSubstraitAggregate()); + options.getSubstraitAggregate(), + options.isIncludeDeletedRows(), + options.isStrictBatchSize(), + options.isDisableScoringAutoprojection()); scanner.allocator = allocator; return scanner; } @@ -103,7 +106,10 @@ static native AsyncScanner createAsyncScanner( Optional> columnOrderings, boolean useScalarIndex, boolean fastSearch, - Optional substraitAggregate); + Optional substraitAggregate, + boolean includeDeletedRows, + boolean strictBatchSize, + boolean disableScoringAutoprojection); /** * Asynchronously scan batches and return a CompletableFuture. 
diff --git a/java/src/main/java/org/lance/ipc/LanceScanner.java b/java/src/main/java/org/lance/ipc/LanceScanner.java index edd3ebc22cc..3a413e0ccfd 100644 --- a/java/src/main/java/org/lance/ipc/LanceScanner.java +++ b/java/src/main/java/org/lance/ipc/LanceScanner.java @@ -77,7 +77,10 @@ public static LanceScanner create( options.isUseScalarIndex(), options.isFastSearch(), options.getSubstraitAggregate(), - options.isCollectStats()); + options.isCollectStats(), + options.isIncludeDeletedRows(), + options.isStrictBatchSize(), + options.isDisableScoringAutoprojection()); scanner.allocator = allocator; scanner.dataset = dataset; scanner.options = options; @@ -103,7 +106,10 @@ static native LanceScanner createScanner( boolean useScalarIndex, boolean fastSearch, Optional substraitAggregate, - boolean collectStats); + boolean collectStats, + boolean includeDeletedRows, + boolean strictBatchSize, + boolean disableScoringAutoprojection); /** * Closes this scanner and releases any system resources associated with it. 
If the scanner is diff --git a/java/src/main/java/org/lance/ipc/ScanOptions.java b/java/src/main/java/org/lance/ipc/ScanOptions.java index 68c485e39a3..a9aad590c2b 100644 --- a/java/src/main/java/org/lance/ipc/ScanOptions.java +++ b/java/src/main/java/org/lance/ipc/ScanOptions.java @@ -40,6 +40,9 @@ public class ScanOptions { private final Optional substraitAggregate; private final boolean collectStats; private final boolean fastSearch; + private final boolean includeDeletedRows; + private final boolean strictBatchSize; + private final boolean disableScoringAutoprojection; public ScanOptions( Optional> fragmentIds, @@ -77,6 +80,9 @@ public ScanOptions( useScalarIndex, substraitAggregate, collectStats, + false, + false, + false, false); } @@ -121,7 +127,10 @@ public ScanOptions( boolean useScalarIndex, Optional substraitAggregate, boolean collectStats, - boolean fastSearch) { + boolean fastSearch, + boolean includeDeletedRows, + boolean strictBatchSize, + boolean disableScoringAutoprojection) { Preconditions.checkArgument( !(filter.isPresent() && substraitFilter.isPresent()), "cannot set both substrait filter and string filter"); @@ -143,6 +152,9 @@ public ScanOptions( this.substraitAggregate = substraitAggregate; this.collectStats = collectStats; this.fastSearch = fastSearch; + this.includeDeletedRows = includeDeletedRows; + this.strictBatchSize = strictBatchSize; + this.disableScoringAutoprojection = disableScoringAutoprojection; } /** @@ -297,6 +309,33 @@ public boolean isCollectStats() { return collectStats; } + /** + * Get whether to include deleted rows in scan results. + * + * @return true if deleted rows should be included, false otherwise. + */ + public boolean isIncludeDeletedRows() { + return includeDeletedRows; + } + + /** + * Get whether to enforce strict batch sizing. + * + * @return true if batch sizes must be strictly enforced, false otherwise. 
+ */ + public boolean isStrictBatchSize() { + return strictBatchSize; + } + + /** + * Get whether to disable scoring autoprojection. + * + * @return true if scoring column autoprojection is disabled, false otherwise. + */ + public boolean isDisableScoringAutoprojection() { + return disableScoringAutoprojection; + } + @Override public String toString() { return MoreObjects.toStringHelper(this) @@ -322,6 +361,9 @@ public String toString() { "substraitAggregate", substraitAggregate.map(buf -> "ByteBuffer[" + buf.remaining() + " bytes]").orElse(null)) .add("collectStats", collectStats) + .add("includeDeletedRows", includeDeletedRows) + .add("strictBatchSize", strictBatchSize) + .add("disableScoringAutoprojection", disableScoringAutoprojection) .toString(); } @@ -345,6 +387,9 @@ public static class Builder { private boolean fastSearch = false; private Optional substraitAggregate = Optional.empty(); private boolean collectStats = false; + private boolean includeDeletedRows = false; + private boolean strictBatchSize = false; + private boolean disableScoringAutoprojection = false; public Builder() {} @@ -372,6 +417,9 @@ public Builder(ScanOptions options) { this.fastSearch = options.isFastSearch(); this.substraitAggregate = options.getSubstraitAggregate(); this.collectStats = options.isCollectStats(); + this.includeDeletedRows = options.isIncludeDeletedRows(); + this.strictBatchSize = options.isStrictBatchSize(); + this.disableScoringAutoprojection = options.isDisableScoringAutoprojection(); } /** @@ -577,6 +625,39 @@ public Builder collectStats(boolean collectStats) { return this; } + /** + * Set whether to include deleted rows in scan results. Default is false. + * + * @param includeDeletedRows whether to include deleted rows + * @return Builder instance for method chaining. + */ + public Builder includeDeletedRows(boolean includeDeletedRows) { + this.includeDeletedRows = includeDeletedRows; + return this; + } + + /** + * Set whether to enforce strict batch sizing. 
Default is false. + * + * @param strictBatchSize whether to enforce strict batch sizing + * @return Builder instance for method chaining. + */ + public Builder strictBatchSize(boolean strictBatchSize) { + this.strictBatchSize = strictBatchSize; + return this; + } + + /** + * Set whether to disable scoring column autoprojection. Default is false. + * + * @param disableScoringAutoprojection whether to disable autoprojection + * @return Builder instance for method chaining. + */ + public Builder disableScoringAutoprojection(boolean disableScoringAutoprojection) { + this.disableScoringAutoprojection = disableScoringAutoprojection; + return this; + } + /** * Build the LanceScanOptions instance. * @@ -601,7 +682,10 @@ public ScanOptions build() { useScalarIndex, substraitAggregate, collectStats, - fastSearch); + fastSearch, + includeDeletedRows, + strictBatchSize, + disableScoringAutoprojection); } } } diff --git a/java/src/main/java/org/lance/merge/MergeInsertParams.java b/java/src/main/java/org/lance/merge/MergeInsertParams.java index de40c9e4f1c..2ae27b67cba 100644 --- a/java/src/main/java/org/lance/merge/MergeInsertParams.java +++ b/java/src/main/java/org/lance/merge/MergeInsertParams.java @@ -38,6 +38,7 @@ public class MergeInsertParams { private int conflictRetries = 10; private long retryTimeoutMs = 30 * 1000; private boolean skipAutoCleanup = false; + private boolean useIndex = true; private List markedGenerations = Collections.emptyList(); public MergeInsertParams(List on) { @@ -227,6 +228,22 @@ public MergeInsertParams withSkipAutoCleanup(boolean skipAutoCleanup) { return this; } + /** + * Controls whether to use indices for the merge operation. + * + *

When set to false, forces a full table scan even if an index exists on the join key. This + * can be useful for benchmarking or when the optimizer chooses a suboptimal path. + * + *

Default is true (use index if available). + * + * @param useIndex Whether to use indices for the merge join + * @return This MergeInsertParams instance + */ + public MergeInsertParams withUseIndex(boolean useIndex) { + this.useIndex = useIndex; + return this; + } + /** * Mark MemWAL generations as merged into the base table. * @@ -298,6 +315,10 @@ public boolean skipAutoCleanup() { return skipAutoCleanup; } + public boolean useIndex() { + return useIndex; + } + @Override public String toString() { return MoreObjects.toStringHelper(this) @@ -315,6 +336,7 @@ public String toString() { .add("conflictRetries", conflictRetries) .add("retryTimeoutMs", retryTimeoutMs) .add("skipAutoCleanup", skipAutoCleanup) + .add("useIndex", useIndex) .toString(); } diff --git a/java/src/test/java/org/lance/AsyncScannerTest.java b/java/src/test/java/org/lance/AsyncScannerTest.java index 578bf000755..fc786ff57c2 100644 --- a/java/src/test/java/org/lance/AsyncScannerTest.java +++ b/java/src/test/java/org/lance/AsyncScannerTest.java @@ -192,6 +192,75 @@ private static int countRows(ArrowReader reader) throws Exception { return rowCount; } + @Test + void testIncludeDeletedRowsAsync(@TempDir Path tempDir) throws Exception { + String datasetPath = tempDir.resolve("async_scanner_include_deleted_rows").toString(); + try (BufferAllocator allocator = new RootAllocator()) { + TestUtils.SimpleTestDataset testDataset = + new TestUtils.SimpleTestDataset(allocator, datasetPath); + testDataset.createEmptyDataset().close(); + try (Dataset dataset = testDataset.write(1, 10)) { + assertEquals(10, dataset.countRows()); + + // Delete half the rows + dataset.delete("id >= 5"); + assertEquals(5, dataset.countRows()); + + // Async scan without includeDeletedRows — should only see live rows + ScanOptions defaultOptions = new ScanOptions.Builder().batchSize(20L).build(); + try (AsyncScanner scanner = AsyncScanner.create(dataset, defaultOptions, allocator)) { + ArrowReader reader = 
scanner.scanBatchesAsync().get(10, TimeUnit.SECONDS); + assertEquals(5, countRows(reader), "default async scan: should exclude deleted rows"); + reader.close(); + } + + // Async scan with includeDeletedRows=true — should see all rows + ScanOptions includeDeletedOptions = + new ScanOptions.Builder() + .batchSize(20L) + .withRowId(true) // required by includeDeletedRows + .includeDeletedRows(true) + .build(); + try (AsyncScanner scanner = + AsyncScanner.create(dataset, includeDeletedOptions, allocator)) { + ArrowReader reader = scanner.scanBatchesAsync().get(10, TimeUnit.SECONDS); + assertEquals( + 10, countRows(reader), "includeDeletedRows async: should include deleted rows"); + reader.close(); + } + } + } + } + + @Test + void testStrictBatchSizeAsync(@TempDir Path tempDir) throws Exception { + String datasetPath = tempDir.resolve("async_scanner_strict_batch_size").toString(); + try (BufferAllocator allocator = new RootAllocator()) { + TestUtils.SimpleTestDataset testDataset = + new TestUtils.SimpleTestDataset(allocator, datasetPath); + testDataset.createEmptyDataset().close(); + try (Dataset dataset = testDataset.write(1, 25)) { + int batchSize = 10; + + ScanOptions strictOptions = + new ScanOptions.Builder().batchSize(batchSize).strictBatchSize(true).build(); + + try (AsyncScanner scanner = AsyncScanner.create(dataset, strictOptions, allocator)) { + ArrowReader reader = scanner.scanBatchesAsync().get(10, TimeUnit.SECONDS); + int totalRows = 0; + while (reader.loadNextBatch()) { + int rows = reader.getVectorSchemaRoot().getRowCount(); + assertTrue( + rows <= batchSize, "strict async: batch " + rows + " should be <= " + batchSize); + totalRows += rows; + } + assertEquals(25, totalRows, "strictBatchSize async: should read all rows"); + reader.close(); + } + } + } + } + /** * Example 3: Multiple concurrent async scans. 
* diff --git a/java/src/test/java/org/lance/MergeInsertTest.java b/java/src/test/java/org/lance/MergeInsertTest.java index c36ec26b4fa..b738ef8852d 100644 --- a/java/src/test/java/org/lance/MergeInsertTest.java +++ b/java/src/test/java/org/lance/MergeInsertTest.java @@ -275,6 +275,29 @@ private ArrowArrayStream convertToStream(VectorSchemaRoot root, RootAllocator al return stream; } + @Test + public void testMergeInsertWithoutIndex() throws Exception { + // Verify that merge insert with useIndex=false still completes and + // produces results consistent with the default (useIndex=true). + + try (VectorSchemaRoot source = buildSource(testDataset.getSchema(), allocator)) { + try (ArrowArrayStream sourceStream = convertToStream(source, allocator)) { + MergeInsertResult result = + dataset.mergeInsert( + new MergeInsertParams(Collections.singletonList("id")) + .withMatchedUpdateAll() + .withNotMatched(MergeInsertParams.WhenNotMatched.InsertAll) + .withUseIndex(false), + sourceStream); + + Assertions.assertEquals( + "{0=Source 0, 1=Source 1, 2=Source 2, 3=Person 3, 4=Person 4, 7=Source 7, 8=Source 8, 9=Source 9}", + readAll(result.dataset()).toString(), + "merge insert with useIndex=false should produce correct upsert results"); + } + } + } + private TreeMap readAll(Dataset dataset) throws Exception { try (ArrowReader reader = dataset.newScan().scanBatches()) { TreeMap map = new TreeMap<>(); diff --git a/java/src/test/java/org/lance/ScannerTest.java b/java/src/test/java/org/lance/ScannerTest.java index 894b208e8af..00434034b64 100644 --- a/java/src/test/java/org/lance/ScannerTest.java +++ b/java/src/test/java/org/lance/ScannerTest.java @@ -697,6 +697,120 @@ void testFastSearchSkipsUnindexedFragments(@TempDir Path tempDir) throws Excepti } } + @Test + void testIncludeDeletedRows(@TempDir Path tempDir) throws Exception { + String datasetPath = tempDir.resolve("include_deleted_rows").toString(); + try (BufferAllocator allocator = new RootAllocator()) { + 
TestUtils.SimpleTestDataset testDataset = + new TestUtils.SimpleTestDataset(allocator, datasetPath); + testDataset.createEmptyDataset().close(); + try (Dataset dataset = testDataset.write(1, 10)) { + assertEquals(10, dataset.countRows()); + + // Delete rows where id >= 5 + dataset.delete("id >= 5"); + assertEquals(5, dataset.countRows()); + + // Default scan should exclude deleted rows + try (LanceScanner scanner = + dataset.newScan(new ScanOptions.Builder().batchSize(20).build())) { + assertEquals(5, scanner.countRows(), "default scan: should exclude deleted rows"); + } + + // includeDeletedRows=true should surface deleted rows + // NOTE: includeDeletedRows requires withRowId=true + try (LanceScanner scanner = + dataset.newScan( + new ScanOptions.Builder() + .batchSize(20) + .withRowId(true) + .includeDeletedRows(true) + .build())) { + assertEquals(10, scanner.countRows(), "includeDeletedRows: should include deleted rows"); + } + } + } + } + + @Test + void testStrictBatchSize(@TempDir Path tempDir) throws Exception { + String datasetPath = tempDir.resolve("strict_batch_size").toString(); + try (BufferAllocator allocator = new RootAllocator()) { + TestUtils.SimpleTestDataset testDataset = + new TestUtils.SimpleTestDataset(allocator, datasetPath); + testDataset.createEmptyDataset().close(); + try (Dataset dataset = testDataset.write(1, 25)) { + int batchSize = 10; + + // With strictBatchSize=true, no batch should exceed batchSize + try (Scanner scanner = + dataset.newScan( + new ScanOptions.Builder().batchSize(batchSize).strictBatchSize(true).build())) { + try (ArrowReader reader = scanner.scanBatches()) { + int totalRows = 0; + while (reader.loadNextBatch()) { + int rows = reader.getVectorSchemaRoot().getRowCount(); + assertTrue(rows <= batchSize, "strict: batch " + rows + " should be <= " + batchSize); + totalRows += rows; + } + assertEquals(25, totalRows); + } + } + + // strictBatchSize=false (default) — batch size may vary + try (Scanner scanner = + 
dataset.newScan(new ScanOptions.Builder().batchSize(batchSize).build())) { + try (ArrowReader reader = scanner.scanBatches()) { + int totalRows = 0; + while (reader.loadNextBatch()) { + totalRows += reader.getVectorSchemaRoot().getRowCount(); + } + assertEquals(25, totalRows); + } + } + } + } + } + + @Test + void testDisableScoringAutoprojection(@TempDir Path tempDir) throws Exception { + String datasetPath = tempDir.resolve("disable_scoring_autoprojection").toString(); + try (BufferAllocator allocator = new RootAllocator()) { + TestUtils.SimpleTestDataset testDataset = + new TestUtils.SimpleTestDataset(allocator, datasetPath); + testDataset.createEmptyDataset().close(); + try (Dataset dataset = testDataset.write(1, 10)) { + // Smoke test: verify the option is accepted and scan still works + ScanOptions options = + new ScanOptions.Builder().batchSize(20).disableScoringAutoprojection(true).build(); + + try (LanceScanner scanner = dataset.newScan(options)) { + assertEquals( + 10, + scanner.countRows(), + "scan with disableScoringAutoprojection should return all rows"); + } + + // Also verify it doesn't break when combined with other options + ScanOptions combinedOptions = + new ScanOptions.Builder() + .batchSize(20) + .filter("id < 5") + .disableScoringAutoprojection(true) + .includeDeletedRows(false) + .strictBatchSize(false) + .build(); + + try (LanceScanner scanner = dataset.newScan(combinedOptions)) { + assertEquals( + 5, + scanner.countRows(), + "scan with disableScoringAutoprojection + filter should work"); + } + } + } + } + private void validScanResult(Dataset dataset, int fragmentId, int rowCount) throws Exception { try (Scanner scanner = dataset.newScan( From 466405f476bcd02b6dfa3f78133eaebd76158c66 Mon Sep 17 00:00:00 2001 From: Prashanth Rao <35005448+prrao87@users.noreply.github.com> Date: Fri, 5 Jun 2026 19:46:12 -0400 Subject: [PATCH 046/177] fix(python): avoid pandas requirement for add_columns UDFs (#7131) MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary This fixes `add_columns` UDF schema inference so Arrow-only callables do not require pandas to be installed. `pd` is a lazy optional-dependency proxy. In a pandas-free environment, evaluating `pd.DataFrame` raises `ModuleNotFoundError` even when the user UDF returns a `pyarrow.RecordBatch`. The fix uses the existing pandas availability guard before the DataFrame `isinstance` checks, matching the conversion path used when executing batch UDFs. The regression test simulates Lance's no-pandas lazy proxy state and verifies that an Arrow-only UDF still works with `batch_size`, covering both plain callables and `batch_udf()` schema inference. ## Validation - `uv run pytest python/tests/test_schema_evolution.py::test_add_columns_arrow_udf_without_pandas_dependency` - `uv run make lint` reached and passed Ruff; local Pyright then failed on existing optional `tensorflow` and `torch` imports in dependency typing on macOS. 🤖PR generated by Codex, but validated by @prrao87 and run with a local, fresh Python SDK build. 
--- python/python/lance/udf.py | 8 +++-- python/python/tests/test_schema_evolution.py | 33 ++++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/python/python/lance/udf.py b/python/python/lance/udf.py index de6c7c4ff59..3a80349479e 100644 --- a/python/python/lance/udf.py +++ b/python/python/lance/udf.py @@ -205,7 +205,9 @@ def normalize_transform( ) ) ) - if isinstance(sample_batch, pd.DataFrame): + if _check_for_pandas(sample_batch) and isinstance( + sample_batch, pd.DataFrame + ): sample_batch = pa.RecordBatch.from_pandas(sample_batch) udf_like.output_schema = sample_batch.schema @@ -233,7 +235,9 @@ def normalize_transform( ) ) ) - if isinstance(sample_batch, pd.DataFrame): + if _check_for_pandas(sample_batch) and isinstance( + sample_batch, pd.DataFrame + ): sample_batch = pa.RecordBatch.from_pandas(sample_batch) udf_like = BatchUDF(udf_like, output_schema=sample_batch.schema) diff --git a/python/python/tests/test_schema_evolution.py b/python/python/tests/test_schema_evolution.py index 205aaa4fa66..7df6962789e 100644 --- a/python/python/tests/test_schema_evolution.py +++ b/python/python/tests/test_schema_evolution.py @@ -6,6 +6,8 @@ from pathlib import Path import lance +import lance.dependencies as dependencies +import lance.udf as udf_module import numpy as np import pandas as pd import pyarrow as pa @@ -287,6 +289,37 @@ def mapper(batch: pa.RecordBatch): check_add_columns(dataset, expected, use_fragments, mapper) +@pytest.mark.parametrize("use_batch_udf", [False, True]) +def test_add_columns_arrow_udf_without_pandas_dependency( + tmp_path: Path, monkeypatch, use_batch_udf +): + table = pa.table({"caption": ["a Shutterstock photo", "clean"]}) + dataset = lance.write_dataset(table, tmp_path) + + def mapper(batch: pa.RecordBatch) -> pa.RecordBatch: + flags = pc.match_substring_regex( + pc.utf8_lower(batch["caption"]), "shutterstock" + ) + return pa.record_batch([flags], names=["wm"]) + + if use_batch_udf: + mapper = 
lance.batch_udf()(mapper) + + # CI has pandas installed, so simulate Lance's no-pandas lazy proxy state. + # Without the guard, accessing pd.DataFrame raises ModuleNotFoundError. + monkeypatch.setattr(dependencies, "_PANDAS_AVAILABLE", False) + monkeypatch.setattr( + udf_module, + "pd", + dependencies._LazyModule("pandas", module_available=False), + ) + + dataset.add_columns(mapper, read_columns=["caption"], batch_size=64) + + expected = table.append_column("wm", pa.array([True, False])) + assert dataset.to_table() == expected + + def test_query_after_merge(tmp_path): # https://github.com/lancedb/lance/issues/1905 tab = pa.table( From b6f8936ec1823a1aec3bb910859bb9622400fdd1 Mon Sep 17 00:00:00 2001 From: George Stamatakis <126914070+gstamatakis95@users.noreply.github.com> Date: Sun, 7 Jun 2026 04:51:41 +0200 Subject: [PATCH 047/177] perf(index): avoid HEAD call when opening vector indexes (#7064) Closes #6944 ## Problem Opening a vector index made an extra network request to find out how big the index file was. We do not need that request. The file size is already saved in the manifest, in `IndexMetadata.files`. `Dataset::open_vector_index` opens `index.idx` just to read the footer and work out the format version. The plain `open` call does not know the file size, so the reader sends a HEAD request to get it. This happens for every vector index, not only HNSW. On the modern reader path the file is then opened a second time with the size already known, so the first HEAD was wasted. The same wasted HEAD also happened on the HNSW auxiliary file open (the case named in the issue), on the legacy remap path, and on legacy detail inference. ## Changes Added one small helper, `open_index_file`. It reads the size from the manifest (`IndexMetadata::file_size_map()`) and opens the file with `open_with_size`. If the size is not recorded, which is the case for older indices, it falls back to the plain `open`. 
The helper is now used everywhere a vector index file is opened: - `index.rs`, the main open that detects the format version (IVF_PQ, IVF_RQ, FLAT). This also removes a duplicate `file_size_map()` call in the same path. - `vector.rs`, the `IVF_HNSW_PQ` and `IVF_HNSW_SQ` auxiliary file opens. - `ivf.rs`, the legacy v1 `remap_index_file`. - `details.rs`, the legacy `infer_vector_index_details` fallback. ## Testing - New test `test_open_index_file_skips_head_when_size_known`. It wraps the store in a proxy that counts metadata reads against the index file. The result is 0 HEAD requests when the size is known and 1 HEAD on the older fallback path. - `cargo fmt --all` - `cargo clippy -p lance --tests -- -D warnings`, clean. - `cargo test -p lance --lib index::vector::`, 208 passed. --- rust/lance/src/index.rs | 11 +- rust/lance/src/index/vector.rs | 155 ++++++++++++++++++++++++- rust/lance/src/index/vector/details.rs | 10 +- rust/lance/src/index/vector/ivf.rs | 12 +- 4 files changed, 182 insertions(+), 6 deletions(-) diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index cc6dad8d1f3..96eb7f88d32 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -1877,7 +1877,15 @@ impl DatasetIndexInternalExt for Dataset { let frag_reuse_index = self.open_frag_reuse_index(metrics).await?; let index_dir = self.indice_files_dir(&index_meta)?; let index_file = index_dir.clone().join(uuid).join(INDEX_FILE_NAME); - let reader: Arc = object_store.open(&index_file).await?.into(); + let file_sizes = index_meta.file_size_map(); + let reader: Arc = vector::open_index_file( + object_store.as_ref(), + &index_file, + INDEX_FILE_NAME, + &file_sizes, + ) + .await? 
+ .into(); let tailing_bytes = read_last_block(reader.as_ref()).await?; let (major_version, minor_version) = read_version(&tailing_bytes)?; @@ -1944,7 +1952,6 @@ impl DatasetIndexInternalExt for Dataset { self.object_store.clone(), SchedulerConfig::max_bandwidth(&self.object_store), ); - let file_sizes = index_meta.file_size_map(); let cached_size = file_sizes .get(INDEX_FILE_NAME) .map(|&size| CachedFileSize::new(size)) diff --git a/rust/lance/src/index/vector.rs b/rust/lance/src/index/vector.rs index 588c96ab781..3a9afeca886 100644 --- a/rust/lance/src/index/vector.rs +++ b/rust/lance/src/index/vector.rs @@ -52,6 +52,7 @@ use lance_index::vector::{ sq::{ScalarQuantizer, builder::SQBuildParams}, }; use lance_index::{INDEX_AUXILIARY_FILE_NAME, INDEX_METADATA_SCHEMA_KEY, IndexType}; +use lance_io::object_store::ObjectStore; use lance_io::traits::Reader; use lance_linalg::distance::*; use lance_table::format::{IndexMetadata, list_index_files_with_sizes}; @@ -1588,6 +1589,24 @@ pub(crate) async fn open_vector_index( Ok(idx) } +/// Open an index file without a HEAD request when the size is already known. +/// +/// `file_sizes` maps a file name to its size in bytes (see +/// `IndexMetadata::file_size_map`). If `file_name` is missing, which is the case +/// for older indices that did not record sizes, this falls back to `open`, which +/// issues a HEAD to learn the size. 
+pub(crate) async fn open_index_file( + object_store: &ObjectStore, + path: &Path, + file_name: &str, + file_sizes: &HashMap, +) -> Result> { + match file_sizes.get(file_name) { + Some(&size) => object_store.open_with_size(path, size as usize).await, + None => object_store.open(path).await, + } +} + #[instrument(level = "debug", skip(dataset, reader))] pub(crate) async fn open_vector_index_v2( dataset: Arc, @@ -1612,11 +1631,18 @@ pub(crate) async fn open_vector_index_v2( .ok_or_else(|| Error::index(format!("Index with id {} does not exist", uuid)))?; let index_dir = dataset.indice_files_dir(&index_meta)?; let object_store = dataset.object_store_for_index(&index_meta).await?; + let file_sizes = index_meta.file_size_map(); let index: Arc = match index_metadata.index_type.as_str() { "IVF_HNSW_PQ" => { let aux_path = index_dir.clone().join(uuid).join(INDEX_AUXILIARY_FILE_NAME); - let aux_reader = object_store.open(&aux_path).await?; + let aux_reader = open_index_file( + object_store.as_ref(), + &aux_path, + INDEX_AUXILIARY_FILE_NAME, + &file_sizes, + ) + .await?; let ivf_data = IvfModel::load(&reader).await?; let options = HNSWIndexOptions { use_residual: true }; @@ -1643,7 +1669,13 @@ pub(crate) async fn open_vector_index_v2( "IVF_HNSW_SQ" => { let aux_path = index_dir.clone().join(uuid).join(INDEX_AUXILIARY_FILE_NAME); - let aux_reader = object_store.open(&aux_path).await?; + let aux_reader = open_index_file( + object_store.as_ref(), + &aux_path, + INDEX_AUXILIARY_FILE_NAME, + &file_sizes, + ) + .await?; let ivf_data = IvfModel::load(&reader).await?; let options = HNSWIndexOptions { @@ -1960,6 +1992,125 @@ mod tests { use lance_index::metrics::NoOpMetricsCollector; use lance_linalg::distance::MetricType; + /// `open_index_file` skips the HEAD when the size is known and still falls + /// back to a HEAD for older indices that did not record sizes. 
A HEAD is + /// issued as a `get_opts` call with `head = true`, so a proxy store counts + /// those against the index file. + /// + /// Regression test for issue #6944. + #[tokio::test] + async fn test_open_index_file_skips_head_when_size_known() { + use lance_index::INDEX_FILE_NAME; + use lance_io::assert_io_eq; + use lance_io::object_store::{ObjectStoreParams, ObjectStoreRegistry}; + + let (store, base) = ObjectStore::from_uri_and_params( + Arc::new(ObjectStoreRegistry::default()), + "memory:///", + &ObjectStoreParams::default(), + ) + .await + .unwrap(); + + let path = base.join(INDEX_FILE_NAME); + // Larger than the block size so size discovery needs a separate HEAD. + let data = vec![7u8; 2 * store.block_size()]; + store.put(&path, &data).await.unwrap(); + + let file_sizes = HashMap::from([(INDEX_FILE_NAME.to_string(), data.len() as u64)]); + + // Size recorded in the manifest, so reading the size issues no HEAD. + let _ = store.io_stats_incremental(); // reset + let reader = open_index_file(store.as_ref(), &path, INDEX_FILE_NAME, &file_sizes) + .await + .unwrap(); + assert_eq!(reader.size().await.unwrap(), data.len()); + let stats = store.io_stats_incremental(); + assert_io_eq!( + stats, + read_iops, + 0, + "a known file size must not trigger a HEAD request" + ); + + // Size unknown, as in an older index, so it falls back to a HEAD. + let _ = store.io_stats_incremental(); // reset + let reader = open_index_file(store.as_ref(), &path, INDEX_FILE_NAME, &HashMap::new()) + .await + .unwrap(); + assert_eq!(reader.size().await.unwrap(), data.len()); + let stats = store.io_stats_incremental(); + assert_io_eq!( + stats, + read_iops, + 1, + "an unknown file size must fall back to exactly one HEAD request" + ); + } + + /// `open_index_file` looks up sizes in `IndexMetadata::file_size_map()` by + /// bare file name.
This pins that a freshly created HNSW index records both + /// the main and auxiliary files under those exact names with nonzero sizes, + /// which is what lets the open path skip the HEAD. + #[tokio::test] + async fn test_hnsw_index_records_file_sizes() { + use lance_index::{INDEX_AUXILIARY_FILE_NAME, INDEX_FILE_NAME}; + + let test_dir = TempStrDir::default(); + let uri = format!("{}/ds", test_dir.as_str()); + + let reader = lance_datagen::gen_batch() + .col("vector", array::rand_vec::(32.into())) + .into_reader_rows(RowCount::from(400), BatchCount::from(1)); + let mut dataset = Dataset::write(reader, &uri, None).await.unwrap(); + + let params = VectorIndexParams::with_ivf_hnsw_pq_params( + MetricType::L2, + IvfBuildParams { + num_partitions: Some(8), + ..Default::default() + }, + HnswBuildParams { + max_level: 6, + m: 24, + ef_construction: 120, + prefetch_distance: None, + }, + PQBuildParams { + num_sub_vectors: 8, + num_bits: 8, + ..Default::default() + }, + ); + dataset + .create_index( + &["vector"], + IndexType::Vector, + Some("hnsw".to_string()), + &params, + false, + ) + .await + .unwrap(); + + let indices = dataset.load_indices().await.unwrap(); + let index = indices.iter().find(|idx| idx.name == "hnsw").unwrap(); + let file_sizes = index.file_size_map(); + + assert!( + file_sizes.get(INDEX_FILE_NAME).copied().unwrap_or(0) > 0, + "manifest should record a nonzero {INDEX_FILE_NAME} size, got {file_sizes:?}" + ); + assert!( + file_sizes + .get(INDEX_AUXILIARY_FILE_NAME) + .copied() + .unwrap_or(0) + > 0, + "manifest should record a nonzero {INDEX_AUXILIARY_FILE_NAME} size, got {file_sizes:?}" + ); + } + #[tokio::test] async fn test_initialize_vector_index_ivf_pq() { let test_dir = TempStrDir::default(); diff --git a/rust/lance/src/index/vector/details.rs b/rust/lance/src/index/vector/details.rs index 83e9b92c209..63f9375792e 100644 --- a/rust/lance/src/index/vector/details.rs +++ b/rust/lance/src/index/vector/details.rs @@ -503,7 +503,15 @@ pub async fn
infer_vector_index_details( let index_dir = dataset.indice_files_dir(index)?; let file_dir = index_dir.clone().join(uuid.as_str()); let index_file = file_dir.clone().join(INDEX_FILE_NAME); - let reader: Arc = dataset.object_store.open(&index_file).await?.into(); + let file_sizes = index.file_size_map(); + let reader: Arc = super::open_index_file( + dataset.object_store.as_ref(), + &index_file, + INDEX_FILE_NAME, + &file_sizes, + ) + .await? + .into(); let tailing_bytes = read_last_block(reader.as_ref()).await?; let (major_version, minor_version) = read_version(&tailing_bytes)?; diff --git a/rust/lance/src/index/vector/ivf.rs b/rust/lance/src/index/vector/ivf.rs index b7e6567b025..2f3cc4588fa 100644 --- a/rust/lance/src/index/vector/ivf.rs +++ b/rust/lance/src/index/vector/ivf.rs @@ -13,7 +13,9 @@ use super::{ utils::PartitionLoadLock, }; use crate::dataset::index::dataset_format_version; +use crate::index::DatasetIndexExt; use crate::index::DatasetIndexInternalExt; +use crate::index::vector::open_index_file; use crate::index::vector::utils::{get_vector_dim, get_vector_type}; use crate::{ dataset::Dataset, @@ -1842,7 +1844,15 @@ pub(crate) async fn remap_index_file( let old_path = dataset.indices_dir().join(old_uuid).join(INDEX_FILE_NAME); let new_path = dataset.indices_dir().join(new_uuid).join(INDEX_FILE_NAME); - let reader: Arc = object_store.open(&old_path).await?.into(); + let file_sizes = dataset + .load_index(old_uuid) + .await? + .map(|index| index.file_size_map()) + .unwrap_or_default(); + let reader: Arc = + open_index_file(object_store, &old_path, INDEX_FILE_NAME, &file_sizes) + .await? 
+ .into(); let mut writer = object_store.create(&new_path).await?; let tasks = generate_remap_tasks(&index.ivf.offsets, &index.ivf.lengths)?; From 4c9ae20f63b35f7d26f313ea542efbc6b6485a93 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Mon, 8 Jun 2026 16:48:41 +0800 Subject: [PATCH 048/177] docs: clarify distributed vector model scopes (#7148) ## Summary Clarifies the distributed vector indexing docs to describe both supported model scopes: shared model artifacts and independent per-segment models. The guide and Python API docs now explain that independently trained IVF/IVF-PQ segments can be committed together because query execution searches each physical segment by UUID, while physical merge remains constrained by compatible model metadata. --- docs/src/guide/distributed_indexing.md | 31 ++++++++++++++++++++++++++ python/python/lance/dataset.py | 20 +++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/docs/src/guide/distributed_indexing.md b/docs/src/guide/distributed_indexing.md index ae17b9bb0f2..ae4531331d0 100644 --- a/docs/src/guide/distributed_indexing.md +++ b/docs/src/guide/distributed_indexing.md @@ -105,6 +105,37 @@ or merged into larger segments: Within a single commit, built segments must have disjoint fragment coverage. +### Vector Model Scope + +Distributed vector builds support two model scopes. + +**Shared model artifacts**: the caller trains or provides IVF centroids once and +passes the same artifacts to every worker. For IVF-PQ segments that should be +physically mergeable, workers should also use the same PQ codebook. This makes +partition ids and quantizer state have the same meaning across segments. + +**Independent segment models**: each worker trains the IVF/PQ model for its own +`fragment_ids`. The resulting segments can be committed together as one logical +index without sharing centroids or codebooks. + +At query time, Lance searches each physical segment independently: + +1. Lance opens each segment by index UUID +2. 
each segment ranks IVF partitions using its own centroids +3. each segment searches the selected partitions using its own quantizer storage +4. Lance merges the candidate rows from all segments by `_distance` + +Because partition ids are interpreted only within a segment during this fanout +query path, independently trained committed segments can return valid results. +For L2 and cosine IVF-PQ, each segment computes residuals against its own IVF +centroid during both build and query, so distances remain estimates of the +original query-to-vector metric. + +Physical merge is a separate operation. It rewrites several segment artifacts +into one artifact with one model metadata scope. Use shared compatible model +artifacts for segments you plan to merge physically, or keep independently +trained segments as separate physical segments. + ## Internal Finalize Model Internally, Lance models distributed vector segment build as: diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 38796e97439..c8132f83ba4 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -3801,6 +3801,17 @@ def create_index( to the dataset. The returned metadata can be passed to ``merge_existing_index_segments(...)`` if grouping is needed and then committed with ``commit_existing_index_segments(...)``. + + Vector segments support both shared and independent model scopes. If + the caller provides the same IVF centroids, and for IVF_PQ the same + PQ codebook, to each worker, the resulting segments share model + semantics and are suitable for workflows that physically merge + compatible segments. If those artifacts are omitted, each segment can + train its own IVF/PQ model for its assigned fragments. Such segments + can be committed together and are queried independently by segment + UUID; partition ids are interpreted within each segment's own model. 
+ Keep independently trained segments as separate physical segments + unless the merge workflow can preserve or reconcile the model state. index_uuid : str, optional A UUID to use for the segment written by this call. If not provided, a new UUID will be generated. @@ -4017,6 +4028,15 @@ def create_index_uncommitted( requirement: - ``fragment_ids`` must be provided + - Vector segments support both shared and independent model scopes. Pass + the same IVF centroids, and for IVF_PQ the same PQ codebook, to each + worker when segments need shared model semantics or physical merge + compatibility. If these artifacts are omitted, each segment may train + its own IVF/PQ model and can be committed with other segments as one + logical index; query execution searches each segment by UUID and + interprets partition ids within that segment. Keep independently + trained segments as separate physical segments unless the merge + workflow can preserve or reconcile the model state. - ``rabitq_model`` (``IVF_RQ`` only): a JSON string produced by ``lance.lance.indices.build_rq_model``. It must be identical across all workers for their segments to be mergeable, since it pins the RaBitQ From fb1676e51c0a7585cc1ea0d07ac4534d7e648d35 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Mon, 8 Jun 2026 22:00:55 +0800 Subject: [PATCH 049/177] perf!: avoid listing index files after writes (#7129) Avoids listing index directories after new index writes by propagating `IndexFile` metadata from writer and builder APIs into manifest metadata. This covers scalar index writers, V3 vector builders, vector append/remap paths, and distributed vector merge while preserving legacy and extension fallbacks where the API cannot return files yet. This reduces extra object-store list/stat IO on fresh index creation and maintenance paths, with IO-count regression coverage for scalar and V3 vector index creation. 
--- rust/lance-index/src/scalar.rs | 17 +- rust/lance-index/src/scalar/bitmap.rs | 59 +++--- rust/lance-index/src/scalar/bloomfilter.rs | 20 +- rust/lance-index/src/scalar/btree.rs | 48 +++-- rust/lance-index/src/scalar/fmindex.rs | 32 +-- rust/lance-index/src/scalar/inverted.rs | 4 +- .../src/scalar/inverted/builder.rs | 187 ++++++++++-------- rust/lance-index/src/scalar/inverted/index.rs | 26 ++- rust/lance-index/src/scalar/json.rs | 6 +- rust/lance-index/src/scalar/label_list.rs | 19 +- rust/lance-index/src/scalar/lance_format.rs | 76 ++++--- rust/lance-index/src/scalar/ngram.rs | 31 ++- rust/lance-index/src/scalar/rtree.rs | 34 ++-- rust/lance-index/src/scalar/zonemap.rs | 20 +- .../src/vector/distributed/index_merger.rs | 14 +- rust/lance/src/index.rs | 125 +++++++++++- rust/lance/src/index/append.rs | 17 +- rust/lance/src/index/create.rs | 40 ++-- rust/lance/src/index/scalar/bitmap.rs | 2 +- rust/lance/src/index/scalar/btree.rs | 2 +- rust/lance/src/index/scalar/inverted.rs | 2 +- rust/lance/src/index/vector.rs | 123 +++++++----- rust/lance/src/index/vector/builder.rs | 55 ++++-- rust/lance/src/index/vector/ivf.rs | 181 ++++++++++------- rust/lance/src/index/vector/ivf/v2.rs | 7 +- 25 files changed, 689 insertions(+), 458 deletions(-) diff --git a/rust/lance-index/src/scalar.rs b/rust/lance-index/src/scalar.rs index 772dfaf4089..d0378b389c8 100644 --- a/rust/lance-index/src/scalar.rs +++ b/rust/lance-index/src/scalar.rs @@ -53,13 +53,6 @@ use lance_datafusion::udf::CONTAINS_TOKENS_UDF; pub const LANCE_SCALAR_INDEX: &str = "__lance_scalar_index"; -/// Summary of a completed index file write. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub struct IndexWriteSummary { - /// The final size of the index file in bytes. 
- pub size_bytes: u64, -} - /// Builtin index types supported by the Lance library /// /// This is primarily for convenience to avoid a bunch of string @@ -193,12 +186,12 @@ pub trait IndexWriter: Send { )) } /// Finishes writing the file and closes the file - async fn finish(&mut self) -> Result; + async fn finish(&mut self) -> Result; /// Finishes writing the file and closes the file with additional metadata async fn finish_with_metadata( &mut self, metadata: HashMap, - ) -> Result; + ) -> Result; } /// Trait for reading an index (or parts of an index) from storage @@ -288,10 +281,10 @@ pub trait IndexStore: std::fmt::Debug + Send + Sync + DeepSizeOf { /// Copy a range of batches from an index file from this store to another /// /// This is often useful when remapping or updating - async fn copy_index_file(&self, name: &str, dest_store: &dyn IndexStore) -> Result<()>; + async fn copy_index_file(&self, name: &str, dest_store: &dyn IndexStore) -> Result; /// Rename an index file - async fn rename_index_file(&self, name: &str, new_name: &str) -> Result<()>; + async fn rename_index_file(&self, name: &str, new_name: &str) -> Result; /// Delete an index file (used in the tmp spill store to keep tmp size down) async fn delete_index_file(&self, name: &str) -> Result<()>; @@ -879,7 +872,7 @@ pub struct CreatedIndex { /// /// This enables skipping HEAD calls when opening indices and provides /// visibility into index storage size via describe_indices(). 
- pub files: Option>, + pub files: Vec, } /// The criteria that specifies how to update an index diff --git a/rust/lance-index/src/scalar/bitmap.rs b/rust/lance-index/src/scalar/bitmap.rs index 10254e699c5..2436b642af7 100644 --- a/rust/lance-index/src/scalar/bitmap.rs +++ b/rust/lance-index/src/scalar/bitmap.rs @@ -36,7 +36,7 @@ use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use tracing::{instrument, warn}; -use super::{AnyQuery, IndexStore, ScalarIndex}; +use super::{AnyQuery, IndexFile, IndexStore, ScalarIndex}; use super::{ BuiltinIndexType, SargableQuery, ScalarIndexParams, SearchResult, btree::OrderableScalarValue, }; @@ -768,13 +768,15 @@ impl ScalarIndex for BitmapIndex { ) -> Result { let state = self.load_bitmap_index_state().await?; let remapped_state = BitmapIndexPlugin::remap_bitmap_state(state, mapping); - BitmapIndexPlugin::write_bitmap_index(remapped_state, dest_store, &self.value_type).await?; + let file = + BitmapIndexPlugin::write_bitmap_index(remapped_state, dest_store, &self.value_type) + .await?; Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&pbold::BitmapIndexDetails::default()) .unwrap(), index_version: BITMAP_INDEX_VERSION, - files: Some(dest_store.list_files_with_sizes().await?), + files: vec![file], }) } @@ -785,7 +787,7 @@ impl ScalarIndex for BitmapIndex { dest_store: &dyn IndexStore, _old_data_filter: Option, ) -> Result { - BitmapIndexPlugin::streaming_build_and_write( + let file = BitmapIndexPlugin::streaming_build_and_write( new_data, Some(self), dest_store, @@ -797,7 +799,7 @@ impl ScalarIndex for BitmapIndex { index_details: prost_types::Any::from_msg(&pbold::BitmapIndexDetails::default()) .unwrap(), index_version: BITMAP_INDEX_VERSION, - files: Some(dest_store.list_files_with_sizes().await?), + files: vec![file], }) } @@ -870,7 +872,7 @@ impl BitmapBatchWriter { } /// Flush any remaining data, write index statistics, and finalize the file. 
- async fn finish(mut self) -> Result<()> { + async fn finish(mut self) -> Result { self.flush().await?; let stats_json = serde_json::to_string(&BitmapStatistics { num_bitmaps: self.num_bitmaps, @@ -878,8 +880,7 @@ impl BitmapBatchWriter { .map_err(|e| Error::internal(format!("failed to serialize bitmap statistics: {e}")))?; let mut metadata = HashMap::new(); metadata.insert(INDEX_STATS_METADATA_KEY.to_string(), stats_json); - self.file.finish_with_metadata(metadata).await?; - Ok(()) + self.file.finish_with_metadata(metadata).await } } @@ -1188,7 +1189,7 @@ impl BitmapIndexPlugin { state: HashMap, index_store: &dyn IndexStore, value_type: &DataType, - ) -> Result<()> { + ) -> Result { Self::write_bitmap_index_with_extras( state, index_store, @@ -1206,7 +1207,7 @@ impl BitmapIndexPlugin { value_type: &DataType, mut metadata: HashMap, global_buffers: Vec<(String, Bytes)>, - ) -> Result<()> { + ) -> Result { let num_bitmaps = state.len(); let schema = Arc::new(Schema::new(vec![ Field::new("keys", value_type.clone(), true), @@ -1270,9 +1271,7 @@ impl BitmapIndexPlugin { .map_err(|e| Error::internal(format!("failed to serialize bitmap statistics: {e}")))?; metadata.insert(INDEX_STATS_METADATA_KEY.to_string(), stats_json); - bitmap_index_file.finish_with_metadata(metadata).await?; - - Ok(()) + bitmap_index_file.finish_with_metadata(metadata).await } /// Builds bitmap index state from a `(value, row_id)` stream without writing it. 
@@ -1301,7 +1300,7 @@ impl BitmapIndexPlugin { pub async fn train_bitmap_index( data: SendableRecordBatchStream, index_store: &dyn IndexStore, - ) -> Result<()> { + ) -> Result { Self::streaming_build_and_write(data, None, index_store, BITMAP_LOOKUP_NAME).await } @@ -1311,15 +1310,15 @@ impl BitmapIndexPlugin { fragment_ids: &[u32], shard_id: Option, progress: Arc, - ) -> Result<()> { + ) -> Result { let partition_id = bitmap_shard_partition_id(fragment_ids, shard_id)?; let file_name = bitmap_shard_file_name(partition_id); progress .stage_start("build_bitmap_shard", None, "rows") .await?; - Self::streaming_build_and_write(data, None, index_store, &file_name).await?; + let file = Self::streaming_build_and_write(data, None, index_store, &file_name).await?; progress.stage_complete("build_bitmap_shard").await?; - Ok(()) + Ok(file) } /// Builds and writes a bitmap index in a streaming fashion from value-sorted @@ -1334,7 +1333,7 @@ impl BitmapIndexPlugin { old_index: Option<&BitmapIndex>, index_store: &dyn IndexStore, output_file_name: &str, - ) -> Result<()> { + ) -> Result { let value_type = data_source.schema().field(0).data_type().clone(); let mut writer = @@ -1427,9 +1426,7 @@ impl BitmapIndexPlugin { writer.emit(null_key, &idx.null_map).await?; } - writer.finish().await?; - - Ok(()) + writer.finish().await } /// Flush a completed value-run from the new data stream, emitting any @@ -1520,7 +1517,7 @@ impl BitmapIndexPlugin { store: &dyn IndexStore, shard_files: &[String], progress: Arc, - ) -> Result<()> { + ) -> Result { progress .stage_start("merge_bitmap_shards", None, "bitmaps") .await?; @@ -1570,10 +1567,10 @@ impl BitmapIndexPlugin { progress .stage_start("write_bitmap_index", Some(1), "files") .await?; - writer.finish().await?; + let file = writer.finish().await?; progress.stage_progress("write_bitmap_index", 1).await?; progress.stage_complete("write_bitmap_index").await?; - Ok(()) + Ok(file) } } @@ -1640,14 +1637,14 @@ pub async fn merge_bitmap_indices( 
progress .stage_start("write_bitmap_index", Some(1), "files") .await?; - BitmapIndexPlugin::write_bitmap_index(merged_state, dest_store, &value_type).await?; + let file = BitmapIndexPlugin::write_bitmap_index(merged_state, dest_store, &value_type).await?; progress.stage_progress("write_bitmap_index", 1).await?; progress.stage_complete("write_bitmap_index").await?; Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&pbold::BitmapIndexDetails::default()).unwrap(), index_version: BITMAP_INDEX_VERSION, - files: Some(dest_store.list_files_with_sizes().await?), + files: vec![file], }) } @@ -1712,7 +1709,7 @@ impl ScalarIndexPlugin for BitmapIndexPlugin { .to_string(), ) })?; - if let Some(fragment_ids) = fragment_ids.as_ref() { + let file = if let Some(fragment_ids) = fragment_ids.as_ref() { Self::train_bitmap_shard( data, index_store, @@ -1720,20 +1717,20 @@ impl ScalarIndexPlugin for BitmapIndexPlugin { request.parameters.shard_id, progress, ) - .await?; + .await? } else if request.parameters.shard_id.is_some() { return Err(Error::invalid_input( "Bitmap shard_id requires fragment_ids and is only supported for distributed shard builds" .to_string(), )); } else { - Self::train_bitmap_index(data, index_store).await?; - } + Self::train_bitmap_index(data, index_store).await? 
+ }; Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&pbold::BitmapIndexDetails::default()) .unwrap(), index_version: BITMAP_INDEX_VERSION, - files: Some(index_store.list_files_with_sizes().await?), + files: vec![file], }) } diff --git a/rust/lance-index/src/scalar/bloomfilter.rs b/rust/lance-index/src/scalar/bloomfilter.rs index 13057658e92..bb8b82a4a4d 100644 --- a/rust/lance-index/src/scalar/bloomfilter.rs +++ b/rust/lance-index/src/scalar/bloomfilter.rs @@ -12,7 +12,7 @@ use crate::scalar::registry::{ ScalarIndexPlugin, TrainingCriteria, TrainingOrdering, TrainingRequest, }; use crate::scalar::{ - BloomFilterQuery, BuiltinIndexType, CreatedIndex, ScalarIndexParams, UpdateCriteria, + BloomFilterQuery, BuiltinIndexType, CreatedIndex, IndexFile, ScalarIndexParams, UpdateCriteria, }; use crate::{Any, pb}; use arrow_array::{Array, UInt64Array}; @@ -458,13 +458,13 @@ impl ScalarIndex for BloomFilterIndex { // Write the combined zones back to storage let mut builder = BloomFilterIndexBuilder::try_new(params)?; builder.blocks = updated_blocks; - builder.write_index(dest_store).await?; + let file = builder.write_index(dest_store).await?; Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&pb::BloomFilterIndexDetails::default()) .unwrap(), index_version: BLOOMFILTER_INDEX_VERSION, - files: Some(dest_store.list_files_with_sizes().await?), + files: vec![file], }) } @@ -620,7 +620,7 @@ impl BloomFilterIndexBuilder { Ok(RecordBatch::try_new(schema, columns)?) 
} - pub async fn write_index(self, index_store: &dyn IndexStore) -> Result<()> { + pub async fn write_index(self, index_store: &dyn IndexStore) -> Result { let record_batch = self.bloomfilter_stats_as_batch()?; let mut file_schema = record_batch.schema().as_ref().clone(); @@ -638,8 +638,7 @@ impl BloomFilterIndexBuilder { .new_index_file(BLOOMFILTER_FILENAME, Arc::new(file_schema)) .await?; index_file.write_record_batch(record_batch).await?; - index_file.finish().await?; - Ok(()) + index_file.finish().await } } @@ -986,13 +985,12 @@ impl BloomFilterIndexPlugin { batches_source: SendableRecordBatchStream, index_store: &dyn IndexStore, options: Option, - ) -> Result<()> { + ) -> Result { let mut builder = BloomFilterIndexBuilder::try_new(options.unwrap_or_default())?; builder.train(batches_source).await?; - builder.write_index(index_store).await?; - Ok(()) + builder.write_index(index_store).await } } @@ -1076,12 +1074,12 @@ impl ScalarIndexPlugin for BloomFilterIndexPlugin { "must provide training request created by new_training_request".into(), ) })?; - Self::train_bloomfilter_index(data, index_store, Some(request.params)).await?; + let file = Self::train_bloomfilter_index(data, index_store, Some(request.params)).await?; Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&pb::BloomFilterIndexDetails::default()) .unwrap(), index_version: BLOOMFILTER_INDEX_VERSION, - files: Some(index_store.list_files_with_sizes().await?), + files: vec![file], }) } diff --git a/rust/lance-index/src/scalar/btree.rs b/rust/lance-index/src/scalar/btree.rs index a650e8db244..5a32e8b15ca 100644 --- a/rust/lance-index/src/scalar/btree.rs +++ b/rust/lance-index/src/scalar/btree.rs @@ -11,7 +11,7 @@ use std::{ }; use super::{ - AnyQuery, BuiltinIndexType, IndexReader, IndexStore, IndexWriter, MetricsCollector, + AnyQuery, BuiltinIndexType, IndexFile, IndexReader, IndexStore, IndexWriter, MetricsCollector, OldIndexDataFilter, SargableQuery, ScalarIndex, ScalarIndexParams, 
SearchResult, compute_next_prefix, }; @@ -1578,13 +1578,14 @@ impl BTreeIndex { )?; let merged_stream = chunk_concat_stream(unchunked, first.batch_size as usize); - train_btree_index(merged_stream, dest_store, first.batch_size, None, None).await?; + let files = + train_btree_index(merged_stream, dest_store, first.batch_size, None, None).await?; Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&pbold::BTreeIndexDetails::default()) .unwrap(), index_version: BTREE_INDEX_VERSION, - files: Some(dest_store.list_files_with_sizes().await?), + files, }) } } @@ -1864,6 +1865,7 @@ impl ScalarIndex for BTreeIndex { let mapping = Arc::new(mapping.clone()); let train_schema = Arc::new(self.train_schema()); + let mut remapped_files = Vec::new(); // TODO: Could potentially parallelize this across parts, unclear it would be worth it for (part_id, page_file) in part_page_files { @@ -1894,7 +1896,10 @@ impl ScalarIndex for BTreeIndex { remapped_stream, )); - train_btree_index(remapped_stream, dest_store, self.batch_size, None, part_id).await?; + let mut files = + train_btree_index(remapped_stream, dest_store, self.batch_size, None, part_id) + .await?; + remapped_files.append(&mut files); } if let Some(ranges_to_files) = &self.ranges_to_files { @@ -1906,7 +1911,7 @@ impl ScalarIndex for BTreeIndex { let lookup_files = (0..num_parts) .map(|part_id| part_lookup_file_path((part_id as u64) << 32)) .collect::>(); - merge_metadata_files( + let merged_files = merge_metadata_files( dest_store, &page_files, &lookup_files, @@ -1914,13 +1919,15 @@ impl ScalarIndex for BTreeIndex { noop_progress(), ) .await?; + remapped_files.retain(|file| file.path.ends_with("_page_data.lance")); + remapped_files.extend(merged_files); } Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&pbold::BTreeIndexDetails::default()) .unwrap(), index_version: BTREE_INDEX_VERSION, - files: Some(dest_store.list_files_with_sizes().await?), + files: remapped_files, }) } @@ -2075,7 +2082,7 @@ pub async fn 
train_btree_index( batch_size: u64, fragment_ids: Option>, range_id: Option, -) -> Result<()> { +) -> Result> { // Create `partition_id` for distributed index building. // This ID serves as a high-level mask (first 32 bits of a u64) to ensure // that index partitions generated by different workers do not conflict. @@ -2140,7 +2147,7 @@ pub async fn train_btree_index( ); batch_idx += 1; } - sub_index_file.finish().await?; + let pages_file = sub_index_file.finish().await?; let record_batch = btree_stats_as_batch(encoded_batches, &value_type)?; let mut file_schema = record_batch.schema().as_ref().clone(); file_schema @@ -2166,8 +2173,8 @@ pub async fn train_btree_index( } }; btree_index_file.write_record_batch(record_batch).await?; - btree_index_file.finish().await?; - Ok(()) + let lookup_file = btree_index_file.finish().await?; + Ok(vec![pages_file, lookup_file]) } fn find_single_partition_files( @@ -2225,7 +2232,7 @@ async fn merge_metadata_files( part_lookup_files: &[String], batch_readhead: Option, progress: Arc, -) -> Result<()> { +) -> Result> { if part_lookup_files.is_empty() || part_page_files.is_empty() { return Err(Error::internal( "No partition files provided for merging".to_string(), @@ -2301,6 +2308,7 @@ async fn merge_metadata_files( progress, ) .await + .map(|file| vec![file]) } else { merge_pages_and_lookups( store, @@ -2354,7 +2362,7 @@ async fn merge_range_partitioned_lookups( batch_size: u64, batch_readhead: Option, progress: Arc, -) -> Result<()> { +) -> Result { let sorted_part_lookup_files = sort_files_by_partition_id(part_lookup_files)?; let mut lookup_file = store .new_index_file(BTREE_LOOKUP_NAME, lookup_schema) @@ -2394,12 +2402,12 @@ async fn merge_range_partitioned_lookups( serde_json::to_string(&pages_per_file)?, ); - lookup_file.finish_with_metadata(metadata).await?; + let lookup_file = lookup_file.finish_with_metadata(metadata).await?; progress.stage_complete("merge_lookups").await?; // In this mode, we only clean up lookup files, and 
page files are untouched. cleanup_partition_files(store, part_lookup_files, &[]).await; - Ok(()) + Ok(lookup_file) } /// Merges partition files using a K-way sort-merge algorithm. @@ -2418,7 +2426,7 @@ async fn merge_pages_and_lookups( batch_size: u64, batch_readhead: Option, progress: Arc, -) -> Result<()> { +) -> Result> { // Create a new global page file let partition_id = extract_partition_id(part_lookup_files[0].as_str())?; let page_file = page_files_map.get(&partition_id).unwrap(); @@ -2441,7 +2449,7 @@ async fn merge_pages_and_lookups( progress.clone(), ) .await?; - page_file.finish().await?; + let page_file = page_file.finish().await?; progress.stage_complete("merge_pages").await?; let lookup_batch = RecordBatch::try_new( @@ -2466,7 +2474,7 @@ async fn merge_pages_and_lookups( .stage_start("write_lookup_file", Some(1), "files") .await?; lookup_file.write_record_batch(lookup_batch).await?; - lookup_file.finish_with_metadata(metadata).await?; + let lookup_file = lookup_file.finish_with_metadata(metadata).await?; progress.stage_progress("write_lookup_file", 1).await?; progress.stage_complete("write_lookup_file").await?; @@ -2474,7 +2482,7 @@ async fn merge_pages_and_lookups( // Only perform deletion after files are successfully written, ensuring debug information is not lost in case of failure cleanup_partition_files(store, part_lookup_files, part_page_files).await; - Ok(()) + Ok(vec![page_file, lookup_file]) } // Adjust local_page_idx_ in each look-up file to create a contiguous global_page_idx @@ -2883,7 +2891,7 @@ impl ScalarIndexPlugin for BTreeIndexPlugin { The `range_id` field will be removed in a future release." 
); } - train_btree_index( + let files = train_btree_index( data, index_store, request @@ -2898,7 +2906,7 @@ impl ScalarIndexPlugin for BTreeIndexPlugin { index_details: prost_types::Any::from_msg(&pbold::BTreeIndexDetails::default()) .unwrap(), index_version: BTREE_INDEX_VERSION, - files: Some(index_store.list_files_with_sizes().await?), + files, }) } diff --git a/rust/lance-index/src/scalar/fmindex.rs b/rust/lance-index/src/scalar/fmindex.rs index 331be04c538..c5a43d691f7 100644 --- a/rust/lance-index/src/scalar/fmindex.rs +++ b/rust/lance-index/src/scalar/fmindex.rs @@ -42,8 +42,8 @@ use crate::scalar::registry::{ DefaultTrainingRequest, ScalarIndexPlugin, TrainingCriteria, TrainingOrdering, TrainingRequest, }; use crate::scalar::{ - AnyQuery, BuiltinIndexType, CreatedIndex, IndexStore, OldIndexDataFilter, ScalarIndex, - ScalarIndexParams, SearchResult, TextQuery, UpdateCriteria, + AnyQuery, BuiltinIndexType, CreatedIndex, IndexFile, IndexStore, OldIndexDataFilter, + ScalarIndex, ScalarIndexParams, SearchResult, TextQuery, UpdateCriteria, }; use crate::vector::VectorIndex; use crate::{Index, IndexType}; @@ -1330,11 +1330,11 @@ impl ScalarIndex for FMIndexScalarIndex { _: Option, ) -> Result { let texts = collect_texts(new_data).await?; - write_partitioned_fmindex(&texts, dest).await?; + let files = write_partitioned_fmindex(&texts, dest).await?; Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&pb::FmIndexIndexDetails {}).unwrap(), index_version: FMINDEX_INDEX_VERSION, - files: Some(dest.list_files_with_sizes().await?), + files, }) } fn update_criteria(&self) -> UpdateCriteria { @@ -1449,7 +1449,7 @@ fn hex_decode(s: &str) -> Result> { /// - Wavelet block rows (BWT nodes) /// - SA sample blocks (packed u64 in LargeBinary) /// - Metadata: c_table, huffman_codes, tree_topology, row_ids, doc_start_positions -async fn write_fmindex(fm: &FMIndex, store: &dyn IndexStore, filename: &str) -> Result<()> { +async fn write_fmindex(fm: &FMIndex, store: &dyn 
IndexStore, filename: &str) -> Result { let schema = Arc::new(FMIndex::block_schema()); let mut writer = store.new_index_file(filename, schema.clone()).await?; @@ -1519,22 +1519,26 @@ async fn write_fmindex(fm: &FMIndex, store: &dyn IndexStore, filename: &str) -> .collect(); metadata.insert("doc_start_positions".into(), hex_encode(&doc_starts_bytes)); - writer.finish_with_metadata(metadata).await?; - Ok(()) + writer.finish_with_metadata(metadata).await } -async fn write_partitioned_fmindex(texts: &[(u64, Vec)], store: &dyn IndexStore) -> Result<()> { +async fn write_partitioned_fmindex( + texts: &[(u64, Vec)], + store: &dyn IndexStore, +) -> Result> { let refs: Vec<(u64, &[u8])> = texts.iter().map(|(id, t)| (*id, t.as_slice())).collect(); if refs.is_empty() { let fm = FMIndex::build(&[])?; - write_fmindex(&fm, store, &fmindex_partition_path(0)).await?; - return Ok(()); + return Ok(vec![ + write_fmindex(&fm, store, &fmindex_partition_path(0)).await?, + ]); } + let mut files = Vec::new(); for (pid, chunk) in refs.chunks(PARTITION_SIZE).enumerate() { let fm = FMIndex::build(chunk)?; - write_fmindex(&fm, store, &fmindex_partition_path(pid as u64)).await?; + files.push(write_fmindex(&fm, store, &fmindex_partition_path(pid as u64)).await?); } - Ok(()) + Ok(files) } // ── Plugin ─────────────────────────────────────────────────────────────────── @@ -1574,11 +1578,11 @@ impl ScalarIndexPlugin for FMIndexPlugin { _progress: Arc, ) -> Result { let texts = collect_texts(data).await?; - write_partitioned_fmindex(&texts, store).await?; + let files = write_partitioned_fmindex(&texts, store).await?; Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&pb::FmIndexIndexDetails {}).unwrap(), index_version: FMINDEX_INDEX_VERSION, - files: Some(store.list_files_with_sizes().await?), + files, }) } fn provides_exact_answer(&self) -> bool { diff --git a/rust/lance-index/src/scalar/inverted.rs b/rust/lance-index/src/scalar/inverted.rs index b3e497a82ca..d0bb0e40d3a 100644 --- 
a/rust/lance-index/src/scalar/inverted.rs +++ b/rust/lance-index/src/scalar/inverted.rs @@ -151,11 +151,11 @@ impl InvertedIndexPlugin { let mut inverted_index = InvertedIndexBuilder::new_with_fragment_mask(params, fragment_mask) .with_progress(progress); - inverted_index.update(data, index_store, None).await?; + let files = inverted_index.update(data, index_store, None).await?; Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&details).unwrap(), index_version: current_fts_format_version().index_version(), - files: Some(index_store.list_files_with_sizes().await?), + files, }) } diff --git a/rust/lance-index/src/scalar/inverted/builder.rs b/rust/lance-index/src/scalar/inverted/builder.rs index 437ea30a730..283806ed32f 100644 --- a/rust/lance-index/src/scalar/inverted/builder.rs +++ b/rust/lance-index/src/scalar/inverted/builder.rs @@ -8,7 +8,7 @@ use crate::scalar::inverted::json::JsonTextStream; use crate::scalar::inverted::tokenizer::document_tokenizer::LanceTokenizer; #[cfg(test)] use crate::scalar::lance_format::LanceIndexStore; -use crate::scalar::{IndexStore, OldIndexDataFilter}; +use crate::scalar::{IndexFile, IndexStore, OldIndexDataFilter}; use crate::vector::graph::OrderedFloat; use crate::{progress::IndexBuildProgress, progress::noop_progress}; use arrow::array::AsArray; @@ -288,7 +288,7 @@ impl InvertedIndexBuilder { new_data: SendableRecordBatchStream, dest_store: &dyn IndexStore, old_data_filter: Option, - ) -> Result<()> { + ) -> Result> { let schema = new_data.schema(); let doc_col = schema.field(0).name(); @@ -305,15 +305,15 @@ impl InvertedIndexBuilder { self.progress .stage_start("tokenize_docs", None, "rows") .await?; - self.update_index(new_data, dest_store).await?; + let mut files = self.update_index(new_data, dest_store).await?; if let Some(OldIndexDataFilter::Fragments { to_remove, .. 
}) = old_data_filter { self.deleted_fragments.extend(to_remove); } self.progress.stage_complete("tokenize_docs").await?; - self.write(dest_store).await?; - Ok(()) + files.extend(self.write(dest_store).await?); + Ok(files) } pub async fn update_from_segments( @@ -322,7 +322,7 @@ impl InvertedIndexBuilder { dest_store: &dyn IndexStore, old_segments: &[Arc], old_data_filter: Option, - ) -> Result<()> { + ) -> Result> { let schema = new_data.schema(); let doc_col = schema.field(0).name(); @@ -332,7 +332,8 @@ impl InvertedIndexBuilder { self.params.lance_tokenizer = Some(doc_type.as_ref().to_string()); } - self.merge_existing_segments(dest_store, old_segments, old_data_filter.as_ref()) + let mut files = self + .merge_existing_segments(dest_store, old_segments, old_data_filter.as_ref()) .await?; let new_data = document_input(new_data, doc_col)?; @@ -340,11 +341,11 @@ impl InvertedIndexBuilder { self.progress .stage_start("tokenize_docs", None, "rows") .await?; - self.update_index(new_data, dest_store).await?; + files.extend(self.update_index(new_data, dest_store).await?); self.progress.stage_complete("tokenize_docs").await?; - self.write(dest_store).await?; - Ok(()) + files.extend(self.write(dest_store).await?); + Ok(files) } async fn merge_existing_segments( @@ -352,10 +353,11 @@ impl InvertedIndexBuilder { dest_store: &dyn IndexStore, old_segments: &[Arc], old_data_filter: Option<&crate::scalar::OldIndexDataFilter>, - ) -> Result<()> { + ) -> Result> { let num_workers = resolve_num_workers(&self.params); let memory_limit_bytes = resolve_worker_memory_limit_bytes(&self.params, num_workers); let mut merged: Option = None; + let mut files = Vec::new(); for index in old_segments { if old_data_filter.is_none() { self.deleted_fragments @@ -382,7 +384,7 @@ impl InvertedIndexBuilder { > u32::MAX as usize; if would_exceed_memory || would_exceed_doc_ids { let builder = std::mem::replace(merged, partition_builder); - self.write_new_partition(dest_store, builder).await?; + 
files.extend(self.write_new_partition(dest_store, builder).await?); } else { merged.merge_from(partition_builder)?; } @@ -393,21 +395,21 @@ impl InvertedIndexBuilder { } if let Some(builder) = merged { - self.write_new_partition(dest_store, builder).await?; + files.extend(self.write_new_partition(dest_store, builder).await?); } - Ok(()) + Ok(files) } async fn write_new_partition( &mut self, dest_store: &dyn IndexStore, mut builder: InnerBuilder, - ) -> Result<()> { + ) -> Result> { let partition_id = self.next_partition_id() | self.fragment_mask.unwrap_or(0); builder.set_id(partition_id); - builder.write(dest_store).await?; + let files = builder.write(dest_store).await?; self.new_partitions.push(partition_id); - Ok(()) + Ok(files) } fn next_partition_id(&self) -> u64 { @@ -424,7 +426,7 @@ impl InvertedIndexBuilder { &mut self, stream: SendableRecordBatchStream, dest_store: &dyn IndexStore, - ) -> Result<()> { + ) -> Result> { let num_workers = resolve_num_workers(&self.params); let tokenizer = self.params.build()?; let with_position = self.params.with_position; @@ -507,9 +509,11 @@ impl InvertedIndexBuilder { // wait for the workers to finish let start = std::time::Instant::now(); let mut tail_partitions = Vec::new(); + let mut files = Vec::new(); for index_task in index_tasks { let output = index_task.await??; self.new_partitions.extend(output.partitions); + files.extend(output.files); if let Some(tail_partition) = output.tail_partition { tail_partitions.push(tail_partition); } @@ -519,10 +523,10 @@ impl InvertedIndexBuilder { if let Some(builder) = merged_tail_partitions { self.new_partitions.push(builder.id()); let mut builder = builder; - builder.write(dest_store.as_ref()).await?; + files.extend(builder.write(dest_store.as_ref()).await?); } log::info!("wait workers indexing elapsed: {:?}", start.elapsed()); - Result::Ok(()) + Result::Ok(files) }; index_build.await @@ -533,7 +537,8 @@ impl InvertedIndexBuilder { mapping: &HashMap>, src_store: Arc, dest_store: 
&dyn IndexStore, - ) -> Result<()> { + ) -> Result> { + let mut files = Vec::new(); for part in self.partitions.iter() { let part = InvertedPartition::load( src_store.clone(), @@ -545,20 +550,24 @@ impl InvertedIndexBuilder { .await?; let mut builder = part.into_builder().await?; builder.remap(mapping).await?; - builder.write(dest_store).await?; + files.extend(builder.write(dest_store).await?); } if self.fragment_mask.is_none() { - self.write_metadata(dest_store, &self.partitions).await?; + files.push(self.write_metadata(dest_store, &self.partitions).await?); } else { // in distributed mode, the part_temp_metadata is written by the worker for &partition_id in &self.partitions { - self.write_part_metadata(dest_store, partition_id).await?; + files.push(self.write_part_metadata(dest_store, partition_id).await?); } } - Ok(()) + Ok(files) } - async fn write_metadata(&self, dest_store: &dyn IndexStore, partitions: &[u64]) -> Result<()> { + async fn write_metadata( + &self, + dest_store: &dyn IndexStore, + partitions: &[u64], + ) -> Result { let mut serialized_deleted_fragments = Vec::with_capacity(self.deleted_fragments.serialized_size()); self.deleted_fragments @@ -607,8 +616,7 @@ impl InvertedIndexBuilder { .new_index_file(METADATA_FILE, metadata_file_schema) .await?; writer.write_record_batch(record_batch).await?; - writer.finish_with_metadata(metadata).await?; - Ok(()) + writer.finish_with_metadata(metadata).await } /// Write partition metadata file for a single partition @@ -619,7 +627,7 @@ impl InvertedIndexBuilder { &self, dest_store: &dyn IndexStore, partition: u64, // Modify parameter type - ) -> Result<()> { + ) -> Result { let partitions = vec![partition]; let mut metadata = HashMap::from_iter(vec![ ("partitions".to_owned(), serde_json::to_string(&partitions)?), @@ -652,30 +660,30 @@ impl InvertedIndexBuilder { let mut writer = dest_store .new_index_file(&file_name, Arc::new(Schema::empty())) .await?; - writer.finish_with_metadata(metadata).await?; - Ok(()) + 
writer.finish_with_metadata(metadata).await } async fn write_metadata_with_progress( &self, dest_store: &dyn IndexStore, partitions: &[u64], - ) -> Result<()> { + ) -> Result> { let total = if self.fragment_mask.is_none() { Some(1) } else { Some(partitions.len() as u64) }; + let mut files = Vec::new(); self.progress .stage_start("write_metadata", total, "files") .await?; if self.fragment_mask.is_none() { - self.write_metadata(dest_store, partitions).await?; + files.push(self.write_metadata(dest_store, partitions).await?); self.progress.stage_progress("write_metadata", 1).await?; } else { let mut completed = 0; for &partition_id in partitions { - self.write_part_metadata(dest_store, partition_id).await?; + files.push(self.write_part_metadata(dest_store, partition_id).await?); completed += 1; self.progress .stage_progress("write_metadata", completed) @@ -683,10 +691,10 @@ impl InvertedIndexBuilder { } } self.progress.stage_complete("write_metadata").await?; - Ok(()) + Ok(files) } - async fn write(&self, dest_store: &dyn IndexStore) -> Result<()> { + async fn write(&self, dest_store: &dyn IndexStore) -> Result> { let mut partitions = Vec::with_capacity(self.partitions.len() + self.new_partitions.len()); partitions.extend_from_slice(&self.partitions); partitions.extend_from_slice(&self.new_partitions); @@ -700,22 +708,29 @@ impl InvertedIndexBuilder { ) .await?; let mut copied = 0; + let mut files = Vec::new(); for part in self.partitions.iter() { - self.src_store - .as_ref() - .expect("existing partitions require a source store") - .copy_index_file(&token_file_path(*part), dest_store) - .await?; - self.src_store - .as_ref() - .expect("existing partitions require a source store") - .copy_index_file(&posting_file_path(*part), dest_store) - .await?; - self.src_store - .as_ref() - .expect("existing partitions require a source store") - .copy_index_file(&doc_file_path(*part), dest_store) - .await?; + files.push( + self.src_store + .as_ref() + .expect("existing partitions 
require a source store") + .copy_index_file(&token_file_path(*part), dest_store) + .await?, + ); + files.push( + self.src_store + .as_ref() + .expect("existing partitions require a source store") + .copy_index_file(&posting_file_path(*part), dest_store) + .await?, + ); + files.push( + self.src_store + .as_ref() + .expect("existing partitions require a source store") + .copy_index_file(&doc_file_path(*part), dest_store) + .await?, + ); copied += 1; self.progress .stage_progress("copy_partitions", copied) @@ -729,9 +744,11 @@ impl InvertedIndexBuilder { } self.progress.stage_complete("copy_partitions").await?; - self.write_metadata_with_progress(dest_store, &partitions) - .await?; - Ok(()) + files.extend( + self.write_metadata_with_progress(dest_store, &partitions) + .await?, + ); + Ok(files) } } @@ -968,12 +985,14 @@ impl InnerBuilder { + posting_lists_size } - pub async fn write(&mut self, store: &dyn IndexStore) -> Result<()> { + pub async fn write(&mut self, store: &dyn IndexStore) -> Result> { let docs = Arc::new(std::mem::take(&mut self.docs)); - self.write_posting_lists(store, docs.clone()).await?; - self.write_tokens(store).await?; - self.write_docs(store, docs).await?; - Ok(()) + let files = vec![ + self.write_posting_lists(store, docs.clone()).await?, + self.write_tokens(store).await?, + self.write_docs(store, docs).await?, + ]; + Ok(files) } #[instrument(level = "debug", skip_all)] @@ -981,7 +1000,7 @@ impl InnerBuilder { &mut self, store: &dyn IndexStore, docs: Arc, - ) -> Result<()> { + ) -> Result { let id = self.id; let mut writer = store .new_index_file( @@ -1067,12 +1086,11 @@ impl InnerBuilder { buffer_id.to_string(), ); } - writer.finish_with_metadata(extra_metadata).await?; - Ok(()) + writer.finish_with_metadata(extra_metadata).await } #[instrument(level = "debug", skip_all)] - async fn write_tokens(&mut self, store: &dyn IndexStore) -> Result<()> { + async fn write_tokens(&mut self, store: &dyn IndexStore) -> Result { log::info!("writing tokens 
of partition {}", self.id); let tokens = std::mem::take(&mut self.tokens); let batch = tokens.to_batch(self.token_set_format)?; @@ -1080,20 +1098,18 @@ impl InnerBuilder { .new_index_file(&token_file_path(self.id), batch.schema()) .await?; writer.write_record_batch(batch).await?; - writer.finish().await?; - Ok(()) + writer.finish().await } #[instrument(level = "debug", skip_all)] - async fn write_docs(&mut self, store: &dyn IndexStore, docs: Arc) -> Result<()> { + async fn write_docs(&mut self, store: &dyn IndexStore, docs: Arc) -> Result { log::info!("writing docs of partition {}", self.id); let batch = docs.to_batch()?; let mut writer = store .new_index_file(&doc_file_path(self.id), batch.schema()) .await?; writer.write_record_batch(batch).await?; - writer.finish().await?; - Ok(()) + writer.finish().await } } @@ -1103,6 +1119,7 @@ struct IndexWorker { id_alloc: Arc, builder: InnerBuilder, partitions: Vec, + files: Vec, schema: SchemaRef, memory_size: u64, worker_memory_limit_bytes: u64, @@ -1119,6 +1136,7 @@ struct TailPartition { struct WorkerOutput { partitions: Vec, + files: Vec, tail_partition: Option, } @@ -1185,6 +1203,7 @@ impl IndexWorker { config.format_version, ), partitions: Vec::new(), + files: Vec::new(), id_alloc, schema, memory_size: 0, @@ -1411,7 +1430,7 @@ impl IndexWorker { ); let written_partition_id = builder.id(); let mut builder = builder; - builder + let files = builder .write(self.dest_store.as_ref()) .await .map_err(|err| { @@ -1420,6 +1439,7 @@ impl IndexWorker { written_partition_id )) })?; + self.files.extend(files); self.partitions.push(written_partition_id); Ok(()) } @@ -1434,6 +1454,7 @@ impl IndexWorker { }; Ok(WorkerOutput { partitions: self.partitions, + files: self.files, tail_partition, }) } @@ -2055,7 +2076,7 @@ mod tests { use super::*; use crate::metrics::NoOpMetricsCollector; use crate::progress::IndexBuildProgress; - use crate::scalar::{IndexFile, IndexReader, IndexWriteSummary, IndexWriter, ScalarIndex}; + use 
crate::scalar::{IndexFile, IndexReader, IndexWriter, ScalarIndex}; use arrow_array::{RecordBatch, StringArray, UInt64Array}; use arrow_schema::{DataType, Field, Schema}; use async_trait::async_trait; @@ -2227,6 +2248,7 @@ mod tests { #[derive(Debug)] struct CountingWriter { + path: String, write_count: Arc, } @@ -2242,15 +2264,21 @@ mod tests { Ok(1) } - async fn finish(&mut self) -> Result { - Ok(IndexWriteSummary { size_bytes: 0 }) + async fn finish(&mut self) -> Result { + Ok(IndexFile { + path: self.path.clone(), + size_bytes: 0, + }) } async fn finish_with_metadata( &mut self, _metadata: HashMap, - ) -> Result { - Ok(IndexWriteSummary { size_bytes: 0 }) + ) -> Result { + Ok(IndexFile { + path: self.path.clone(), + size_bytes: 0, + }) } } @@ -2270,10 +2298,11 @@ mod tests { async fn new_index_file( &self, - _name: &str, + name: &str, _schema: Arc, ) -> Result> { Ok(Box::new(CountingWriter { + path: name.to_string(), write_count: self.write_count.clone(), })) } @@ -2284,13 +2313,17 @@ mod tests { )) } - async fn copy_index_file(&self, _name: &str, _dest_store: &dyn IndexStore) -> Result<()> { + async fn copy_index_file( + &self, + _name: &str, + _dest_store: &dyn IndexStore, + ) -> Result { Err(Error::not_supported( "CountingStore does not support copying", )) } - async fn rename_index_file(&self, _name: &str, _new_name: &str) -> Result<()> { + async fn rename_index_file(&self, _name: &str, _new_name: &str) -> Result { Err(Error::not_supported( "CountingStore does not support renaming", )) diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index a6e94cd1a93..1ce17f53244 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -538,7 +538,7 @@ impl InvertedIndex { .with_token_set_format(first.token_set_format) .with_format_version(first.format_version()) .with_posting_tail_codec(first.posting_tail_codec()); - builder + let files = builder 
.update_from_segments(new_data, dest_store, segments, old_data_filter) .await?; @@ -547,7 +547,7 @@ impl InvertedIndex { Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&details).unwrap(), index_version: first.index_version(), - files: Some(dest_store.list_files_with_sizes().await?), + files, }) } @@ -1107,7 +1107,8 @@ impl ScalarIndex for InvertedIndex { mapping: &HashMap>, dest_store: &dyn IndexStore, ) -> Result { - self.to_builder() + let files = self + .to_builder() .remap(mapping, self.store.clone(), dest_store) .await?; @@ -1116,7 +1117,7 @@ impl ScalarIndex for InvertedIndex { Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&details).unwrap(), index_version: self.index_version(), - files: Some(dest_store.list_files_with_sizes().await?), + files, }) } @@ -1126,7 +1127,8 @@ impl ScalarIndex for InvertedIndex { dest_store: &dyn IndexStore, old_data_filter: Option, ) -> Result { - self.to_builder() + let files = self + .to_builder() .update(new_data, dest_store, old_data_filter) .await?; @@ -1135,7 +1137,7 @@ impl ScalarIndex for InvertedIndex { Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&details).unwrap(), index_version: self.index_version(), - files: Some(dest_store.list_files_with_sizes().await?), + files, }) } @@ -5949,10 +5951,18 @@ mod tests { Ok(reader) } } - async fn copy_index_file(&self, name: &str, dest_store: &dyn IndexStore) -> Result<()> { + async fn copy_index_file( + &self, + name: &str, + dest_store: &dyn IndexStore, + ) -> Result { self.inner.copy_index_file(name, dest_store).await } - async fn rename_index_file(&self, name: &str, new_name: &str) -> Result<()> { + async fn rename_index_file( + &self, + name: &str, + new_name: &str, + ) -> Result { self.inner.rename_index_file(name, new_name).await } async fn delete_index_file(&self, name: &str) -> Result<()> { diff --git a/rust/lance-index/src/scalar/json.rs b/rust/lance-index/src/scalar/json.rs index 6431909c9ce..81b5a0b57e3 100644 --- 
a/rust/lance-index/src/scalar/json.rs +++ b/rust/lance-index/src/scalar/json.rs @@ -130,7 +130,7 @@ impl ScalarIndex for JsonIndex { index_details: prost_types::Any::from_msg(&json_details)?, // TODO: We should store the target index version in the details index_version: JSON_INDEX_VERSION, - files: Some(dest_store.list_files_with_sizes().await?), + files: target_created.files, }) } @@ -152,7 +152,7 @@ impl ScalarIndex for JsonIndex { index_details: prost_types::Any::from_msg(&json_details)?, // TODO: We should store the target index version in the details index_version: JSON_INDEX_VERSION, - files: Some(dest_store.list_files_with_sizes().await?), + files: target_created.files, }) } @@ -785,7 +785,7 @@ impl ScalarIndexPlugin for JsonIndexPlugin { Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&index_details)?, index_version: JSON_INDEX_VERSION, - files: Some(index_store.list_files_with_sizes().await?), + files: target_index.files, }) } diff --git a/rust/lance-index/src/scalar/label_list.rs b/rust/lance-index/src/scalar/label_list.rs index 86b1bd6d3df..55cd392a1b7 100644 --- a/rust/lance-index/src/scalar/label_list.rs +++ b/rust/lance-index/src/scalar/label_list.rs @@ -27,7 +27,7 @@ use lance_select::{NullableRowAddrSet, RowAddrTreeMap, RowSetOps}; use roaring::RoaringBitmap; use tracing::instrument; -use super::{AnyQuery, IndexStore, LabelListQuery, ScalarIndex, bitmap::BitmapIndex}; +use super::{AnyQuery, IndexFile, IndexStore, LabelListQuery, ScalarIndex, bitmap::BitmapIndex}; use super::{BuiltinIndexType, SargableQuery, ScalarIndexParams}; use super::{MetricsCollector, SearchResult}; use crate::frag_reuse::FragReuseIndex; @@ -228,7 +228,7 @@ impl ScalarIndex for LabelListIndex { .copied() .unwrap_or(Some(addr_as_u64)) })); - write_label_list_bitmap_index( + let file = write_label_list_bitmap_index( remapped_state, dest_store, self.values_index.value_type(), @@ -240,7 +240,7 @@ impl ScalarIndex for LabelListIndex { index_details: 
prost_types::Any::from_msg(&pbold::LabelListIndexDetails::default()) .unwrap(), index_version: LABEL_LIST_INDEX_VERSION, - files: Some(dest_store.list_files_with_sizes().await?), + files: vec![file], }) } @@ -262,13 +262,15 @@ impl ScalarIndex for LabelListIndex { if !new_nulls.is_empty() { merged_nulls |= &new_nulls; } - write_label_list_bitmap_index(merged_state, dest_store, &value_type, &merged_nulls).await?; + let file = + write_label_list_bitmap_index(merged_state, dest_store, &value_type, &merged_nulls) + .await?; Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&pbold::LabelListIndexDetails::default()) .unwrap(), index_version: LABEL_LIST_INDEX_VERSION, - files: Some(dest_store.list_files_with_sizes().await?), + files: vec![file], }) } @@ -475,7 +477,7 @@ async fn write_label_list_bitmap_index( store: &dyn IndexStore, value_type: &DataType, list_nulls: &RowAddrTreeMap, -) -> Result<()> { +) -> Result { BitmapIndexPlugin::write_bitmap_index_with_extras( state, store, @@ -672,12 +674,13 @@ impl ScalarIndexPlugin for LabelListIndexPlugin { let (state, value_type) = BitmapIndexPlugin::build_bitmap_index_state(data, HashMap::new()).await?; let list_nulls = list_nulls.lock().unwrap().clone(); - write_label_list_bitmap_index(state, index_store, &value_type, &list_nulls).await?; + let file = + write_label_list_bitmap_index(state, index_store, &value_type, &list_nulls).await?; Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&pbold::LabelListIndexDetails::default()) .unwrap(), index_version: LABEL_LIST_INDEX_VERSION, - files: Some(index_store.list_files_with_sizes().await?), + files: vec![file], }) } diff --git a/rust/lance-index/src/scalar/lance_format.rs b/rust/lance-index/src/scalar/lance_format.rs index 7fd9d12547d..79faee37e9e 100644 --- a/rust/lance-index/src/scalar/lance_format.rs +++ b/rust/lance-index/src/scalar/lance_format.rs @@ -3,7 +3,7 @@ //! 
Utilities for serializing and deserializing scalar indices in the lance format -use super::{IndexReader, IndexStore, IndexWriteSummary, IndexWriter}; +use super::{IndexFile, IndexReader, IndexStore, IndexWriter}; use arrow_array::RecordBatch; use arrow_schema::Schema; use async_trait::async_trait; @@ -23,7 +23,7 @@ use lance_io::scheduler::{ScanScheduler, SchedulerConfig}; use lance_io::utils::CachedFileSize; use lance_io::{ReadBatchParams, object_store::ObjectStore}; use lance_table::format::SelfDescribingFileReader; -use lance_table::format::{IndexFile, list_index_files_with_sizes}; +use lance_table::format::list_index_files_with_sizes; use object_store::path::Path; use std::cmp::min; use std::collections::HashMap; @@ -109,9 +109,10 @@ impl IndexWriter for PreviousFileWrit Ok(offset as u64) } - async fn finish(&mut self) -> Result { + async fn finish(&mut self) -> Result { Self::finish(self).await?; - Ok(IndexWriteSummary { + Ok(IndexFile { + path: String::new(), size_bytes: self.tell().await? as u64, }) } @@ -119,29 +120,36 @@ impl IndexWriter for PreviousFileWrit async fn finish_with_metadata( &mut self, metadata: HashMap, - ) -> Result { + ) -> Result { Self::finish_with_metadata(self, &metadata).await?; - Ok(IndexWriteSummary { + Ok(IndexFile { + path: String::new(), size_bytes: self.tell().await? 
as u64, }) } } +struct LanceIndexWriter { + path: String, + inner: current_writer::FileWriter, +} + #[async_trait] -impl IndexWriter for current_writer::FileWriter { +impl IndexWriter for LanceIndexWriter { async fn write_record_batch(&mut self, batch: RecordBatch) -> Result { - let offset = self.tell().await?; - self.write_batch(&batch).await?; + let offset = self.inner.tell().await?; + self.inner.write_batch(&batch).await?; Ok(offset) } async fn add_global_buffer(&mut self, data: Bytes) -> Result { - Self::add_global_buffer(self, data).await + self.inner.add_global_buffer(data).await } - async fn finish(&mut self) -> Result { - let summary = Self::finish(self).await?; - Ok(IndexWriteSummary { + async fn finish(&mut self) -> Result { + let summary = self.inner.finish().await?; + Ok(IndexFile { + path: self.path.clone(), size_bytes: summary.size_bytes, }) } @@ -149,12 +157,13 @@ impl IndexWriter for current_writer::FileWriter { async fn finish_with_metadata( &mut self, metadata: HashMap, - ) -> Result { + ) -> Result { metadata.into_iter().for_each(|(k, v)| { - self.add_schema_metadata(k, v); + self.inner.add_schema_metadata(k, v); }); - let summary = Self::finish(self).await?; - Ok(IndexWriteSummary { + let summary = self.inner.finish().await?; + Ok(IndexFile { + path: self.path.clone(), size_bytes: summary.size_bytes, }) } @@ -393,7 +402,10 @@ impl IndexStore for LanceIndexStore { ..Default::default() }, )?; - Ok(Box::new(writer)) + Ok(Box::new(LanceIndexWriter { + path: name.to_string(), + inner: writer, + })) } async fn open_index_file(&self, name: &str) -> Result> { @@ -433,7 +445,7 @@ impl IndexStore for LanceIndexStore { } } - async fn copy_index_file(&self, name: &str, dest_store: &dyn IndexStore) -> Result<()> { + async fn copy_index_file(&self, name: &str, dest_store: &dyn IndexStore) -> Result { let path = self.index_dir.clone().join(name); let other_store = dest_store.as_any().downcast_ref::(); @@ -443,7 +455,15 @@ impl IndexStore for LanceIndexStore { 
// This does blindly assume that both stores are using the same underlying object_store // but there is no easy way to verify this and it happens to always be true at the moment let dest_path = dest_store.index_dir.clone().join(name); - self.object_store.copy(&path, &dest_path).await + self.object_store.copy(&path, &dest_path).await?; + let size_bytes = match self.file_sizes.get(name) { + Some(size_bytes) => *size_bytes, + None => self.object_store.size(&path).await?, + }; + Ok(IndexFile { + path: name.to_string(), + size_bytes, + }) } _ => { let reader = self.open_index_file(name).await?; @@ -456,18 +476,24 @@ impl IndexStore for LanceIndexStore { let batch = reader.read_range(offset..next_offset, None).await?; writer.write_record_batch(batch).await?; } - writer.finish().await?; - - Ok(()) + writer.finish().await } } } - async fn rename_index_file(&self, name: &str, new_name: &str) -> Result<()> { + async fn rename_index_file(&self, name: &str, new_name: &str) -> Result { let path = self.index_dir.clone().join(name); let new_path = self.index_dir.clone().join(new_name); self.object_store.copy(&path, &new_path).await?; - self.object_store.delete(&path).await + self.object_store.delete(&path).await?; + let size_bytes = match self.file_sizes.get(name) { + Some(size_bytes) => *size_bytes, + None => self.object_store.size(&new_path).await?, + }; + Ok(IndexFile { + path: new_name.to_string(), + size_bytes, + }) } async fn delete_index_file(&self, name: &str) -> Result<()> { diff --git a/rust/lance-index/src/scalar/ngram.rs b/rust/lance-index/src/scalar/ngram.rs index 748a8f5231c..cab1f37ee8d 100644 --- a/rust/lance-index/src/scalar/ngram.rs +++ b/rust/lance-index/src/scalar/ngram.rs @@ -9,7 +9,7 @@ use std::{collections::HashMap, sync::Arc}; use super::lance_format::LanceIndexStore; use super::{ - AnyQuery, BuiltinIndexType, IndexReader, IndexStore, IndexWriter, MetricsCollector, + AnyQuery, BuiltinIndexType, IndexFile, IndexReader, IndexStore, IndexWriter, 
MetricsCollector, ScalarIndex, ScalarIndexParams, SearchResult, TextQuery, }; use crate::frag_reuse::FragReuseIndex; @@ -504,13 +504,13 @@ impl ScalarIndex for NGramIndex { offset += BATCH_SIZE; } - writer.finish().await?; + let file = writer.finish().await?; Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&pbold::NGramIndexDetails::default()) .unwrap(), index_version: NGRAM_INDEX_VERSION, - files: Some(dest_store.list_files_with_sizes().await?), + files: vec![file], }) } @@ -523,7 +523,7 @@ impl ScalarIndex for NGramIndex { let mut builder = NGramIndexBuilder::try_new(NGramIndexBuilderOptions::default())?; let spill_files = builder.train(new_data).await?; - builder + let file = builder .write_index(dest_store, spill_files, Some(self.store.clone())) .await?; @@ -531,7 +531,7 @@ impl ScalarIndex for NGramIndex { index_details: prost_types::Any::from_msg(&pbold::NGramIndexDetails::default()) .unwrap(), index_version: NGRAM_INDEX_VERSION, - files: Some(dest_store.list_files_with_sizes().await?), + files: vec![file], }) } @@ -1047,7 +1047,7 @@ impl NGramIndexBuilder { mut left_stream: impl Stream> + Unpin, mut right_stream: impl Stream> + Unpin, writer: &mut dyn IndexWriter, - ) -> Result<()> { + ) -> Result { let mut left_state = left_stream.try_next().await?; let mut right_state = right_stream.try_next().await?; @@ -1079,8 +1079,7 @@ impl NGramIndexBuilder { } } - writer.finish().await?; - Ok(()) + writer.finish().await } async fn merge_spill_files( @@ -1182,7 +1181,7 @@ impl NGramIndexBuilder { store: &dyn IndexStore, spill_files: Vec, old_index: Option>, - ) -> Result<()> { + ) -> Result { let mut writer = store .new_index_file(POSTINGS_FILENAME, POSTINGS_SCHEMA.clone()) .await?; @@ -1190,15 +1189,14 @@ impl NGramIndexBuilder { if spill_files.is_empty() { if let Some(old_index) = old_index { // An update with no new data, just copy the old index to the new store - old_index.copy_index_file(POSTINGS_FILENAME, store).await?; + return 
old_index.copy_index_file(POSTINGS_FILENAME, store).await; } else { // Training an index with no data, make an empty index let mut writer = store .new_index_file(POSTINGS_FILENAME, POSTINGS_SCHEMA.clone()) .await?; - writer.finish().await?; + return writer.finish().await; } - return Ok(()); } let mut index_to_copy = self.merge_spills(spill_files).await?; @@ -1222,8 +1220,7 @@ impl NGramIndexBuilder { offset += batch_size; } - writer.finish().await?; - Ok(()) + writer.finish().await } } @@ -1234,7 +1231,7 @@ impl NGramIndexPlugin { pub async fn train_ngram_index( batches_source: SendableRecordBatchStream, index_store: &dyn IndexStore, - ) -> Result<()> { + ) -> Result { let mut builder = NGramIndexBuilder::try_new(NGramIndexBuilderOptions::default())?; let spill_files = builder.train(batches_source).await?; @@ -1300,12 +1297,12 @@ impl ScalarIndexPlugin for NGramIndexPlugin { )); } - Self::train_ngram_index(data, index_store).await?; + let file = Self::train_ngram_index(data, index_store).await?; Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&pbold::NGramIndexDetails::default()) .unwrap(), index_version: NGRAM_INDEX_VERSION, - files: Some(index_store.list_files_with_sizes().await?), + files: vec![file], }) } diff --git a/rust/lance-index/src/scalar/rtree.rs b/rust/lance-index/src/scalar/rtree.rs index e419c59baa6..5cefae36da6 100644 --- a/rust/lance-index/src/scalar/rtree.rs +++ b/rust/lance-index/src/scalar/rtree.rs @@ -10,8 +10,8 @@ use crate::scalar::registry::{ }; use crate::scalar::rtree::sort::Sorter; use crate::scalar::{ - AnyQuery, BuiltinIndexType, CreatedIndex, GeoQuery, IndexReader, IndexStore, IndexWriter, - ScalarIndex, ScalarIndexParams, SearchResult, UpdateCriteria, + AnyQuery, BuiltinIndexType, CreatedIndex, GeoQuery, IndexFile, IndexReader, IndexStore, + IndexWriter, ScalarIndex, ScalarIndexParams, SearchResult, UpdateCriteria, }; use crate::vector::VectorIndex; use crate::{Index, IndexType, pb}; @@ -593,7 +593,7 @@ impl ScalarIndex 
for RTreeIndex { num_items: self.metadata.num_items + stats.num_items, }; - RTreeIndexPlugin::train_rtree_index( + let files = RTreeIndexPlugin::train_rtree_index( merged_bbox_data, merge_stats, self.metadata.page_size, @@ -604,7 +604,7 @@ impl ScalarIndex for RTreeIndex { Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&pb::RTreeIndexDetails::default())?, index_version: RTREE_INDEX_VERSION, - files: Some(dest_store.list_files_with_sizes().await?), + files, }) } @@ -830,7 +830,7 @@ impl RTreeIndexPlugin { total_bbox: BoundingBox, store: &dyn IndexStore, page_size: u32, - ) -> Result<()> { + ) -> Result { let mut page_idx: u64 = 0; let mut writer = store .new_index_file(RTREE_PAGES_NAME, RTREE_PAGE_SCHEMA.clone()) @@ -868,12 +868,13 @@ impl RTreeIndexPlugin { .finish_with_metadata( RTreeMetadata::new(page_size, page_idx, num_items, total_bbox).into_map(), ) - .await?; - - Ok(()) + .await } - pub async fn write_nulls(store: &dyn IndexStore, null_map: RowAddrTreeMap) -> Result<()> { + pub async fn write_nulls( + store: &dyn IndexStore, + null_map: RowAddrTreeMap, + ) -> Result { let mut writer = store .new_index_file(RTREE_NULLS_NAME, RTREE_NULLS_SCHEMA.clone()) .await?; @@ -885,8 +886,7 @@ impl RTreeIndexPlugin { )?; writer.write_record_batch(batch).await?; - writer.finish().await?; - Ok(()) + writer.finish().await } async fn train_rtree_index( @@ -894,12 +894,12 @@ impl RTreeIndexPlugin { stats: BboxStreamStats, page_size: u32, store: &dyn IndexStore, - ) -> Result<()> { + ) -> Result> { // new sorted stream let sorter = HilbertSorter::new(stats.total_bbox); let sorted_data = sorter.sort(bbox_data).await?; - Self::write_index( + let page_file = Self::write_index( sorted_data, stats.num_items, stats.total_bbox, @@ -908,9 +908,9 @@ impl RTreeIndexPlugin { ) .await?; - Self::write_nulls(store, stats.null_map).await?; + let nulls_file = Self::write_nulls(store, stats.null_map).await?; - Ok(()) + Ok(vec![page_file, nulls_file]) } } @@ -965,12 +965,12 @@ impl 
ScalarIndexPlugin for RTreeIndexPlugin { Self::process_and_analyze_bbox_stream(bbox_data, page_size, spill_store.clone()) .await?; - Self::train_rtree_index(bbox_data, stats, page_size, index_store).await?; + let files = Self::train_rtree_index(bbox_data, stats, page_size, index_store).await?; Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&pb::RTreeIndexDetails::default())?, index_version: RTREE_INDEX_VERSION, - files: Some(index_store.list_files_with_sizes().await?), + files, }) } diff --git a/rust/lance-index/src/scalar/zonemap.rs b/rust/lance-index/src/scalar/zonemap.rs index 64601a64f96..60f66c91c59 100644 --- a/rust/lance-index/src/scalar/zonemap.rs +++ b/rust/lance-index/src/scalar/zonemap.rs @@ -19,7 +19,7 @@ use crate::scalar::registry::{ ScalarIndexPlugin, TrainingCriteria, TrainingOrdering, TrainingRequest, }; use crate::scalar::{ - BuiltinIndexType, CreatedIndex, SargableQuery, ScalarIndexParams, UpdateCriteria, + BuiltinIndexType, CreatedIndex, IndexFile, SargableQuery, ScalarIndexParams, UpdateCriteria, compute_next_prefix, }; use lance_arrow_stats::StatisticsAccumulator; @@ -639,13 +639,13 @@ impl ScalarIndex for ZoneMapIndex { let mut builder = ZoneMapIndexBuilder::try_new(options, self.data_type.clone())?; builder.options.rows_per_zone = self.rows_per_zone; builder.maps = updated_zones; - builder.write_index(dest_store).await?; + let file = builder.write_index(dest_store).await?; Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&pbold::ZoneMapIndexDetails::default()) .unwrap(), index_version: ZONEMAP_INDEX_VERSION, - files: Some(dest_store.list_files_with_sizes().await?), + files: vec![file], }) } @@ -772,7 +772,7 @@ impl ZoneMapIndexBuilder { Ok(RecordBatch::try_new(schema, columns)?) 
} - pub async fn write_index(self, index_store: &dyn IndexStore) -> Result<()> { + pub async fn write_index(self, index_store: &dyn IndexStore) -> Result { let record_batch = self.zonemap_stats_as_batch()?; let mut file_schema = record_batch.schema().as_ref().clone(); @@ -785,8 +785,7 @@ impl ZoneMapIndexBuilder { .new_index_file(ZONEMAP_FILENAME, Arc::new(file_schema)) .await?; index_file.write_record_batch(record_batch).await?; - index_file.finish().await?; - Ok(()) + index_file.finish().await } } @@ -891,7 +890,7 @@ impl ZoneMapIndexPlugin { batches_source: SendableRecordBatchStream, index_store: &dyn IndexStore, options: Option, - ) -> Result<()> { + ) -> Result { // train_zonemap_index: calling scan_aligned_chunks let value_type = batches_source.schema().field(0).data_type().clone(); @@ -899,8 +898,7 @@ impl ZoneMapIndexPlugin { builder.train(batches_source).await?; - builder.write_index(index_store).await?; - Ok(()) + builder.write_index(index_store).await } } @@ -984,12 +982,12 @@ impl ScalarIndexPlugin for ZoneMapIndexPlugin { "must provide training request created by new_training_request".into(), ) })?; - Self::train_zonemap_index(data, index_store, Some(request.params)).await?; + let file = Self::train_zonemap_index(data, index_store, Some(request.params)).await?; Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&pbold::ZoneMapIndexDetails::default()) .unwrap(), index_version: ZONEMAP_INDEX_VERSION, - files: Some(index_store.list_files_with_sizes().await?), + files: vec![file], }) } diff --git a/rust/lance-index/src/vector/distributed/index_merger.rs b/rust/lance-index/src/vector/distributed/index_merger.rs index e93984bbcca..4c6a752b2a6 100755 --- a/rust/lance-index/src/vector/distributed/index_merger.rs +++ b/rust/lance-index/src/vector/distributed/index_merger.rs @@ -709,7 +709,7 @@ pub async fn merge_partial_vector_auxiliary_files( aux_paths: &[object_store::path::Path], target_dir: &object_store::path::Path, progress: Arc, -) -> Result<()> 
{ +) -> Result { if aux_paths.is_empty() { return Err(Error::index( "No partial auxiliary files were selected for merge".to_string(), @@ -1498,16 +1498,18 @@ pub async fn merge_partial_vector_auxiliary_files( } let dt2 = distance_type.ok_or_else(|| Error::index("Distance type missing".to_string()))?; write_unified_ivf_and_index_metadata(w, &ivf_model, dt2, idx_type_final).await?; - w.finish().await?; + let summary = w.finish().await?; progress.stage_progress("write_auxiliary_index", 1).await?; progress.stage_complete("write_auxiliary_index").await?; + Ok(lance_table::format::IndexFile { + path: INDEX_AUXILIARY_FILE_NAME.to_string(), + size_bytes: summary.size_bytes, + }) } else { - return Err(Error::index( + Err(Error::index( "Failed to initialize unified writer".to_string(), - )); + )) } - - Ok(()) } #[cfg(test)] diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 96eb7f88d32..c973fe1ad95 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -568,7 +568,7 @@ pub(crate) async fn remap_index( matched.index_version, matched.name )) })?; - remap_vector_index( + let files = remap_vector_index( Arc::new(dataset.clone()), &field_path, index_id, @@ -578,17 +578,13 @@ pub(crate) async fn remap_index( ) .await?; - // Capture file sizes for the vector index - let index_dir = dataset.indices_dir().join(new_id.to_string()); - let files = list_index_files_with_sizes(&dataset.object_store, &index_dir).await?; - CreatedIndex { index_details: prost_types::Any::from_msg( &lance_index::pb::VectorIndexDetails::default(), ) .unwrap(), index_version, - files: Some(files), + files, } } _ => { @@ -604,7 +600,7 @@ pub(crate) async fn remap_index( new_id, index_details: created_index.index_details, index_version: created_index.index_version, - files: created_index.files, + files: Some(created_index.files), })) } @@ -1376,7 +1372,7 @@ impl DatasetIndexExt for Dataset { index_version: res.new_index_version, created_at: Some(chrono::Utc::now()), base_id: None, 
// New merged index file locates in the cloned dataset. - files: res.files, + files: Some(res.files), }; removed_indices.extend(res.removed_indices.iter().map(|&idx| idx.clone())); new_indices.push(new_idx); @@ -2587,7 +2583,7 @@ mod tests { kmeans::{KMeansParams, train_kmeans}, sq::builder::SQBuildParams, }; - use lance_io::{assert_io_eq, assert_io_lt}; + use lance_io::{assert_io_eq, assert_io_lt, utils::tracking_store::IoStats}; use lance_linalg::distance::{DistanceType, MetricType}; use lance_testing::datagen::generate_random_array; use object_store::ObjectStoreExt; @@ -2629,6 +2625,20 @@ mod tests { } } + fn list_io_stats(stats: &IoStats) -> IoStats { + let requests = stats + .requests + .iter() + .filter(|request| request.method == "list") + .cloned() + .collect::>(); + IoStats { + read_iops: requests.len() as u64, + requests, + ..Default::default() + } + } + fn segment_from_metadata(metadata: &IndexMetadata) -> IndexSegment { IndexSegment::new( metadata.uuid, @@ -7270,6 +7280,103 @@ mod tests { } } + #[tokio::test] + async fn test_scalar_index_create_does_not_list_files() { + let test_dir = TempStrDir::default(); + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("category", DataType::Int32, false), + ])); + let ids = Int32Array::from_iter_values(0..128); + let categories = Int32Array::from_iter_values((0..128).map(|value| value % 8)); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(ids), Arc::new(categories)]) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + let mut dataset = Dataset::write(reader, test_dir.as_str(), None) + .await + .unwrap(); + let io_tracker = dataset.object_store.as_ref().io_tracker().clone(); + + io_tracker.incremental_stats(); + dataset + .create_index( + &["category"], + IndexType::Bitmap, + Some("category_bitmap".to_string()), + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + + let stats = 
io_tracker.incremental_stats(); + let list_stats = list_io_stats(&stats); + assert_io_eq!( + list_stats, + read_iops, + 0, + "new scalar index files should be reported by writer return values" + ); + } + + #[tokio::test] + async fn test_vector_index_create_does_not_list_files() { + let test_dir = TempStrDir::default(); + let dimension = 8; + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new( + "vector", + DataType::FixedSizeList( + Arc::new(Field::new("item", DataType::Float32, true)), + dimension, + ), + false, + ), + ])); + let ids = Int32Array::from_iter_values(0..256); + let vectors = (0..256) + .map(|row| { + Some( + (0..dimension) + .map(|dim| Some((row * dimension + dim) as f32)) + .collect::>(), + ) + }) + .collect::>(); + let vector_array = + FixedSizeListArray::from_iter_primitive::(vectors, dimension); + let batch = + RecordBatch::try_new(schema.clone(), vec![Arc::new(ids), Arc::new(vector_array)]) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + let mut dataset = Dataset::write(reader, test_dir.as_str(), None) + .await + .unwrap(); + let io_tracker = dataset.object_store.as_ref().io_tracker().clone(); + + io_tracker.incremental_stats(); + dataset + .create_index( + &["vector"], + IndexType::Vector, + Some("vector_ivf_flat".to_string()), + &VectorIndexParams::ivf_flat(4, MetricType::L2), + true, + ) + .await + .unwrap(); + + let stats = io_tracker.incremental_stats(); + let list_stats = list_io_stats(&stats); + assert_io_eq!( + list_stats, + read_iops, + 0, + "new V3 vector index files should be reported by builder return values" + ); + } + #[tokio::test] async fn test_index_file_sizes_through_lifecycle() { use crate::dataset::WriteDestination; diff --git a/rust/lance/src/index/append.rs b/rust/lance/src/index/append.rs index a89b64df276..47e3f78ed05 100644 --- a/rust/lance/src/index/append.rs +++ b/rust/lance/src/index/append.rs @@ -16,7 +16,7 @@ use lance_index::{ }, }; use 
lance_select::{RowAddrTreeMap, RowSetOps}; -use lance_table::format::{Fragment, IndexMetadata, list_index_files_with_sizes}; +use lance_table::format::{Fragment, IndexMetadata}; use roaring::RoaringBitmap; use uuid::Uuid; @@ -37,7 +37,7 @@ pub struct IndexMergeResults<'a> { pub new_index_version: i32, pub new_index_details: prost_types::Any, /// List of files and their sizes for the merged index - pub files: Option>, + pub files: Vec, } async fn build_stable_row_id_filter( @@ -436,7 +436,7 @@ pub async fn merge_indices_with_unindexed_frags<'a>( vec![(selected_metadata, selected_index)], )?; let selected_ivf_view = selected_logical_index.as_ivf()?; - let (new_uuid, indices_merged) = Box::pin(optimize_vector_indices( + let (new_uuid, indices_merged, files) = Box::pin(optimize_vector_indices( dataset.as_ref().clone(), Option::< lance_io::stream::RecordBatchStreamAdapter< @@ -452,8 +452,6 @@ pub async fn merge_indices_with_unindexed_frags<'a>( return Ok(None); } - let index_dir = dataset.indices_dir().join(new_uuid.to_string()); - let files = list_index_files_with_sizes(&dataset.object_store, &index_dir).await?; let new_fragment_bitmap = removed_segment .effective_fragment_bitmap(&dataset.fragment_bitmap) .or_else(|| removed_segment.fragment_bitmap.clone()) @@ -466,7 +464,7 @@ pub async fn merge_indices_with_unindexed_frags<'a>( CreatedIndex { index_details: vector_index_details_default(), index_version: lance_index::IndexType::Vector.version() as u32, - files: Some(files), + files, }, )) } else { @@ -488,7 +486,7 @@ pub async fn merge_indices_with_unindexed_frags<'a>( Some(scanner.try_into_stream().await?) 
}; - let (new_uuid, indices_merged) = optimize_vector_indices( + let (new_uuid, indices_merged, files) = optimize_vector_indices( dataset.as_ref().clone(), new_data_stream, &field_path, @@ -519,9 +517,6 @@ pub async fn merge_indices_with_unindexed_frags<'a>( .map(|d| d.as_ref().clone()) .unwrap_or_else(vector_index_details_default); - let index_dir = dataset.indices_dir().join(new_uuid.to_string()); - let files = list_index_files_with_sizes(&dataset.object_store, &index_dir).await?; - Ok(( new_uuid, removed_indices, @@ -532,7 +527,7 @@ pub async fn merge_indices_with_unindexed_frags<'a>( // index_version <= our max supported version, so we can safely // write the current library's version for this index type. index_version: lance_index::IndexType::Vector.version() as u32, - files: Some(files), + files, }, )) } diff --git a/rust/lance/src/index/create.rs b/rust/lance/src/index/create.rs index ce8e65d8356..45b0baa38d6 100644 --- a/rust/lance/src/index/create.rs +++ b/rust/lance/src/index/create.rs @@ -8,8 +8,7 @@ use crate::{ transaction::{Operation, TransactionBuilder}, }, index::{ - DatasetIndexExt, DatasetIndexInternalExt, IntoIndexSegment, - build_index_metadata_from_segments, + DatasetIndexExt, DatasetIndexInternalExt, scalar::{build_bitmap_index_segment, build_scalar_index}, vector::{ LANCE_VECTOR_INDEX, VectorIndexParams, build_distributed_vector_index, @@ -366,12 +365,12 @@ impl<'a> CreateIndexBuilder<'a> { })?; let index_version = vec_params.index_type().version() as u32; - if train { + let files = if train { // Check if this is distributed indexing (fragment-level) if let Some(fragments) = &self.fragments { // For distributed indexing, build only on specified fragments // This creates temporary index metadata without committing - let segment_uuid = Box::pin(build_distributed_vector_index( + let (segment_uuid, files) = Box::pin(build_distributed_vector_index( self.dataset, column, &index_name, @@ -383,6 +382,7 @@ impl<'a> CreateIndexBuilder<'a> { )) .await?; 
output_index_uuid = segment_uuid; + files } else { // Standard full dataset indexing Box::pin(build_vector_index( @@ -394,7 +394,7 @@ impl<'a> CreateIndexBuilder<'a> { fri, self.progress.clone(), )) - .await?; + .await? } } else { // Create empty vector index @@ -405,19 +405,12 @@ impl<'a> CreateIndexBuilder<'a> { &index_id.to_string(), vec_params, ) - .await?; - } - // Capture file sizes after vector index creation - let index_dir = self - .dataset - .indices_dir() - .join(output_index_uuid.to_string()); - let files = - list_index_files_with_sizes(&self.dataset.object_store, &index_dir).await?; + .await? + }; CreatedIndex { index_details: vector_index_details(vec_params), index_version, - files: Some(files), + files, } } // Can't use if let Some(...) here because it's not stable yet. @@ -456,7 +449,7 @@ impl<'a> CreateIndexBuilder<'a> { CreatedIndex { index_details: vector_index_details_default(), index_version: self.index_type.version() as u32, - files: Some(files), + files, } } (IndexType::FragmentReuse, _) => { @@ -489,7 +482,7 @@ impl<'a> CreateIndexBuilder<'a> { index_version: created_index.index_version as i32, created_at: Some(chrono::Utc::now()), base_id: None, - files: created_index.files, + files: Some(created_index.files), }) } @@ -509,22 +502,11 @@ impl<'a> CreateIndexBuilder<'a> { vec![] }; let transaction = if uses_segment_commit_path(self.index_type, self.params) { - let field_id = *new_idx.fields.first().ok_or_else(|| { - Error::internal(format!( - "Index '{}' is missing field ids after build", - new_idx.name - )) - })?; - let index_name = new_idx.name.clone(); let dataset_version = new_idx.dataset_version; - let segments = vec![new_idx.into_index_segment()?]; - let new_indices = - build_index_metadata_from_segments(self.dataset, &index_name, field_id, segments) - .await?; TransactionBuilder::new( dataset_version, Operation::CreateIndex { - new_indices, + new_indices: vec![new_idx], removed_indices, }, ) diff --git 
a/rust/lance/src/index/scalar/bitmap.rs b/rust/lance/src/index/scalar/bitmap.rs index 11214a9bfdc..0eec39a0c38 100644 --- a/rust/lance/src/index/scalar/bitmap.rs +++ b/rust/lance/src/index/scalar/bitmap.rs @@ -70,7 +70,7 @@ pub(in crate::index) async fn merge_segments( index_version: created_index.index_version as i32, created_at: Some(chrono::Utc::now()), base_id: None, - files: created_index.files, + files: Some(created_index.files), ..segments[0].clone() }) } diff --git a/rust/lance/src/index/scalar/btree.rs b/rust/lance/src/index/scalar/btree.rs index 34534f6811b..b860b117dc7 100644 --- a/rust/lance/src/index/scalar/btree.rs +++ b/rust/lance/src/index/scalar/btree.rs @@ -188,6 +188,6 @@ pub(crate) async fn merge_segments( index_version: created_index.index_version as i32, created_at: Some(chrono::Utc::now()), base_id: None, - files: created_index.files, + files: Some(created_index.files), }) } diff --git a/rust/lance/src/index/scalar/inverted.rs b/rust/lance/src/index/scalar/inverted.rs index 44cf6ff2e08..2caa6376d85 100644 --- a/rust/lance/src/index/scalar/inverted.rs +++ b/rust/lance/src/index/scalar/inverted.rs @@ -137,7 +137,7 @@ pub(crate) async fn merge_segments( index_version: created_index.index_version as i32, created_at: Some(chrono::Utc::now()), base_id: None, - files: created_index.files, + files: Some(created_index.files), ..segments[0].clone() }) } diff --git a/rust/lance/src/index/vector.rs b/rust/lance/src/index/vector.rs index 3a9afeca886..3d0d922fde7 100644 --- a/rust/lance/src/index/vector.rs +++ b/rust/lance/src/index/vector.rs @@ -18,7 +18,7 @@ mod fixture_test; use self::{ivf::*, pq::PQIndex}; use arrow_schema::{DataType, Schema}; -use builder::IvfIndexBuilder; +use builder::{IvfIndexBuilder, VectorIndexBuildSummary}; use datafusion::physical_plan::SendableRecordBatchStream; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use futures::stream; @@ -55,7 +55,7 @@ use lance_index::{INDEX_AUXILIARY_FILE_NAME, 
INDEX_METADATA_SCHEMA_KEY, IndexTyp use lance_io::object_store::ObjectStore; use lance_io::traits::Reader; use lance_linalg::distance::*; -use lance_table::format::{IndexMetadata, list_index_files_with_sizes}; +use lance_table::format::{IndexFile, IndexMetadata}; use serde::Serialize; use tracing::instrument; use utils::get_vector_type; @@ -595,7 +595,7 @@ pub(crate) async fn build_distributed_vector_index( frag_reuse_index: Option>, fragment_ids: &[u32], progress: Arc, -) -> Result { +) -> Result<(Uuid, Vec)> { let (element_type, index_type, ivf_params, shuffler) = prepare_vector_segment_build( dataset, column, @@ -661,7 +661,7 @@ pub(crate) async fn build_distributed_vector_index( DataType::Float16 | DataType::Float32 | DataType::Float64 => { let ivf_model = make_ivf_model(); - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( filtered_dataset, column.to_owned(), index_dir.clone(), @@ -677,11 +677,12 @@ pub(crate) async fn build_distributed_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok((segment_uuid, summary.files)); } DataType::UInt8 => { let ivf_model = make_ivf_model(); - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( filtered_dataset, column.to_owned(), index_dir.clone(), @@ -697,6 +698,7 @@ pub(crate) async fn build_distributed_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok((segment_uuid, summary.files)); } _ => { return Err(Error::index(format!( @@ -725,7 +727,7 @@ pub(crate) async fn build_distributed_vector_index( let ivf_model = make_ivf_model(); let global_pq = make_global_pq(pq_params)?; - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( filtered_dataset, column.to_owned(), index_dir.clone(), @@ -745,6 +747,7 @@ pub(crate) async fn build_distributed_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok((segment_uuid, summary.files)); } } } @@ -756,7 +759,7 @@ pub(crate) async fn build_distributed_vector_index( stages ))); }; 
- IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( filtered_dataset, column.to_owned(), index_dir.clone(), @@ -771,6 +774,7 @@ pub(crate) async fn build_distributed_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok((segment_uuid, summary.files)); } IndexType::IvfHnswFlat => { @@ -783,7 +787,7 @@ pub(crate) async fn build_distributed_vector_index( match element_type { DataType::UInt8 => { - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( filtered_dataset, column.to_owned(), index_dir.clone(), @@ -798,9 +802,10 @@ pub(crate) async fn build_distributed_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok((segment_uuid, summary.files)); } _ => { - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( filtered_dataset, column.to_owned(), index_dir.clone(), @@ -815,6 +820,7 @@ pub(crate) async fn build_distributed_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok((segment_uuid, summary.files)); } } } @@ -836,7 +842,7 @@ pub(crate) async fn build_distributed_vector_index( let ivf_model = make_ivf_model(); let global_pq = make_global_pq(pq_params)?; - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( filtered_dataset, column.to_owned(), index_dir.clone(), @@ -856,6 +862,7 @@ pub(crate) async fn build_distributed_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok((segment_uuid, summary.files)); } IndexType::IvfHnswSq => { @@ -871,7 +878,7 @@ pub(crate) async fn build_distributed_vector_index( stages ))); }; - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( filtered_dataset, column.to_owned(), index_dir.clone(), @@ -886,6 +893,7 @@ pub(crate) async fn build_distributed_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok((segment_uuid, summary.files)); } IndexType::IvfRq => { @@ -898,7 +906,7 @@ pub(crate) async fn build_distributed_vector_index( let ivf_model = make_ivf_model(); 
- IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( filtered_dataset, column.to_owned(), index_dir.clone(), @@ -917,6 +925,7 @@ pub(crate) async fn build_distributed_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok((segment_uuid, summary.files)); } _ => { @@ -925,9 +934,7 @@ pub(crate) async fn build_distributed_vector_index( index_type ))); } - }; - - Ok(segment_uuid) + } } /// Build a Vector Index @@ -940,7 +947,7 @@ pub(crate) async fn build_vector_index( params: &VectorIndexParams, frag_reuse_index: Option>, progress: Arc, -) -> Result<()> { +) -> Result> { let (element_type, index_type, ivf_params, shuffler) = prepare_vector_segment_build( dataset, column, @@ -955,7 +962,7 @@ pub(crate) async fn build_vector_index( match index_type { IndexType::IvfFlat => match element_type { DataType::Float16 | DataType::Float32 | DataType::Float64 => { - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( dataset.clone(), column.to_owned(), dataset.indices_dir().clone().join(uuid), @@ -969,9 +976,10 @@ pub(crate) async fn build_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok(summary.files); } DataType::UInt8 => { - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( dataset.clone(), column.to_owned(), dataset.indices_dir().clone().join(uuid), @@ -985,6 +993,7 @@ pub(crate) async fn build_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok(summary.files); } _ => { return Err(Error::index(format!( @@ -1004,7 +1013,7 @@ pub(crate) async fn build_vector_index( match params.version { IndexFileVersion::Legacy => { - build_ivf_pq_index( + let files = build_ivf_pq_index( dataset, column, name, @@ -1015,6 +1024,7 @@ pub(crate) async fn build_vector_index( progress.clone(), ) .await?; + return Ok(files); } IndexFileVersion::V3 => { let mut builder = IvfIndexBuilder::::new( @@ -1029,11 +1039,12 @@ pub(crate) async fn build_vector_index( frag_reuse_index, )?; - builder 
+ let summary = builder .with_transpose(!params.skip_transpose) .with_progress(progress.clone()) .build() .await?; + return Ok(summary.files); } } } @@ -1045,7 +1056,7 @@ pub(crate) async fn build_vector_index( ))); }; - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( dataset.clone(), column.to_owned(), dataset.indices_dir().clone().join(uuid), @@ -1059,6 +1070,7 @@ pub(crate) async fn build_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok(summary.files); } IndexType::IvfRq => { let StageParams::RQ(rq_params) = &stages[1] else { @@ -1080,11 +1092,12 @@ pub(crate) async fn build_vector_index( frag_reuse_index, )?; - builder + let summary = builder .with_transpose(!params.skip_transpose) .with_progress(progress.clone()) .build() .await?; + return Ok(summary.files); } IndexType::IvfHnswFlat => { let StageParams::Hnsw(hnsw_params) = &stages[1] else { @@ -1095,7 +1108,7 @@ pub(crate) async fn build_vector_index( }; match element_type { DataType::UInt8 => { - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( dataset.clone(), column.to_owned(), dataset.indices_dir().clone().join(uuid), @@ -1109,9 +1122,10 @@ pub(crate) async fn build_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok(summary.files); } _ => { - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( dataset.clone(), column.to_owned(), dataset.indices_dir().clone().join(uuid), @@ -1125,6 +1139,7 @@ pub(crate) async fn build_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok(summary.files); } } } @@ -1141,7 +1156,7 @@ pub(crate) async fn build_vector_index( stages ))); }; - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( dataset.clone(), column.to_owned(), dataset.indices_dir().clone().join(uuid), @@ -1155,6 +1170,7 @@ pub(crate) async fn build_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok(summary.files); } IndexType::IvfHnswSq => { let 
StageParams::Hnsw(hnsw_params) = &stages[1] else { @@ -1169,7 +1185,7 @@ pub(crate) async fn build_vector_index( stages ))); }; - IvfIndexBuilder::::new( + let summary = IvfIndexBuilder::::new( dataset.clone(), column.to_owned(), dataset.indices_dir().clone().join(uuid), @@ -1183,6 +1199,7 @@ pub(crate) async fn build_vector_index( .with_progress(progress.clone()) .build() .await?; + return Ok(summary.files); } _ => { return Err(Error::index(format!( @@ -1190,8 +1207,7 @@ pub(crate) async fn build_vector_index( index_type ))); } - }; - Ok(()) + } } /// Build a Vector Index incrementally using an existing index's IVF model and quantizer @@ -1205,7 +1221,7 @@ pub(crate) async fn build_vector_index_incremental( existing_index: Arc, frag_reuse_index: Option>, progress: Arc, -) -> Result<()> { +) -> Result { let stages = ¶ms.stages; if stages.is_empty() { @@ -1265,7 +1281,7 @@ pub(crate) async fn build_vector_index_incremental( match (sub_index_type, quantization_type) { // IVF_FLAT (SubIndexType::Flat, QuantizationType::Flat) => { - IvfIndexBuilder::::new_incremental( + let summary = IvfIndexBuilder::::new_incremental( dataset.clone(), column.to_owned(), index_dir, @@ -1280,9 +1296,10 @@ pub(crate) async fn build_vector_index_incremental( .with_progress(progress.clone()) .build() .await?; + return Ok(summary); } (SubIndexType::Flat, QuantizationType::FlatBin) => { - IvfIndexBuilder::::new_incremental( + let summary = IvfIndexBuilder::::new_incremental( dataset.clone(), column.to_owned(), index_dir, @@ -1297,6 +1314,7 @@ pub(crate) async fn build_vector_index_incremental( .with_progress(progress.clone()) .build() .await?; + return Ok(summary); } // IVF_PQ (SubIndexType::Flat, QuantizationType::Product) => { @@ -1310,17 +1328,18 @@ pub(crate) async fn build_vector_index_incremental( frag_reuse_index, OptimizeOptions::append(), )?; - builder + let summary = builder .with_ivf(ivf_model) .with_quantizer(quantizer.try_into()?) 
.with_transpose(!params.skip_transpose) .with_progress(progress.clone()) .build() .await?; + return Ok(summary); } // IVF_SQ (SubIndexType::Flat, QuantizationType::Scalar) => { - IvfIndexBuilder::::new_incremental( + let summary = IvfIndexBuilder::::new_incremental( dataset.clone(), column.to_owned(), index_dir, @@ -1335,6 +1354,7 @@ pub(crate) async fn build_vector_index_incremental( .with_progress(progress.clone()) .build() .await?; + return Ok(summary); } // IVF_RQ (SubIndexType::Flat, QuantizationType::Rabit) => { @@ -1348,13 +1368,14 @@ pub(crate) async fn build_vector_index_incremental( frag_reuse_index, OptimizeOptions::append(), )?; - builder + let summary = builder .with_ivf(ivf_model) .with_quantizer(quantizer.try_into()?) .with_transpose(!params.skip_transpose) .with_progress(progress.clone()) .build() .await?; + return Ok(summary); } // IVF_HNSW variants (SubIndexType::Hnsw, quantization_type) => { @@ -1367,7 +1388,7 @@ pub(crate) async fn build_vector_index_incremental( match quantization_type { QuantizationType::Flat => { - IvfIndexBuilder::::new_incremental( + let summary = IvfIndexBuilder::::new_incremental( dataset.clone(), column.to_owned(), index_dir, @@ -1382,9 +1403,10 @@ pub(crate) async fn build_vector_index_incremental( .with_progress(progress.clone()) .build() .await?; + return Ok(summary); } QuantizationType::FlatBin => { - IvfIndexBuilder::::new_incremental( + let summary = IvfIndexBuilder::::new_incremental( dataset.clone(), column.to_owned(), index_dir, @@ -1399,9 +1421,10 @@ pub(crate) async fn build_vector_index_incremental( .with_progress(progress.clone()) .build() .await?; + return Ok(summary); } QuantizationType::Product => { - IvfIndexBuilder::::new_incremental( + let summary = IvfIndexBuilder::::new_incremental( dataset.clone(), column.to_owned(), index_dir, @@ -1416,9 +1439,10 @@ pub(crate) async fn build_vector_index_incremental( .with_progress(progress.clone()) .build() .await?; + return Ok(summary); } QuantizationType::Scalar 
=> { - IvfIndexBuilder::::new_incremental( + let summary = IvfIndexBuilder::::new_incremental( dataset.clone(), column.to_owned(), index_dir, @@ -1433,6 +1457,7 @@ pub(crate) async fn build_vector_index_incremental( .with_progress(progress.clone()) .build() .await?; + return Ok(summary); } QuantizationType::Rabit => { return Err(Error::index( @@ -1442,8 +1467,6 @@ pub(crate) async fn build_vector_index_incremental( } } } - - Ok(()) } /// Build an empty vector index without training on data @@ -1454,7 +1477,7 @@ pub(crate) async fn build_empty_vector_index( name: &str, _uuid: &str, _params: &VectorIndexParams, -) -> Result<()> { +) -> Result> { // For now, return a NotImplementedError to indicate this functionality // is still being developed Err(Error::not_supported_source( @@ -1475,13 +1498,13 @@ pub(crate) async fn remap_vector_index( new_uuid: &Uuid, old_metadata: &IndexMetadata, mapping: &HashMap>, -) -> Result<()> { +) -> Result> { let old_index = dataset .open_vector_index(column, &old_uuid.to_string(), &NoOpMetricsCollector) .await?; if let Some(ivf_index) = old_index.as_any().downcast_ref::() { - remap_index_file( + let file = remap_index_file( dataset.as_ref(), &old_uuid.to_string(), &new_uuid.to_string(), @@ -1496,9 +1519,10 @@ pub(crate) async fn remap_vector_index( vec![], ) .await?; + Ok(vec![file]) } else { // it's v3 index - remap_index_file_v3( + let files = remap_index_file_v3( dataset.as_ref(), &new_uuid.to_string(), old_index, @@ -1506,9 +1530,8 @@ pub(crate) async fn remap_vector_index( column.to_string(), ) .await?; + Ok(files) } - - Ok(()) } /// Open the Vector index on dataset, specified by the `uuid`. 
@@ -1822,7 +1845,7 @@ pub async fn initialize_vector_index( .open_frag_reuse_index(&NoOpMetricsCollector) .await?; - build_vector_index_incremental( + let summary = build_vector_index_incremental( target_dataset, column_name, &new_uuid.to_string(), @@ -1833,10 +1856,6 @@ pub async fn initialize_vector_index( ) .await?; - // Capture file sizes for the new vector index - let index_dir = target_dataset.indices_dir().join(new_uuid.to_string()); - let files = list_index_files_with_sizes(&target_dataset.object_store, &index_dir).await?; - let field = target_dataset.schema().field(column_name).ok_or_else(|| { Error::index(format!( "Column '{}' not found in target dataset", @@ -1856,7 +1875,7 @@ pub async fn initialize_vector_index( index_version: source_index.index_version, created_at: Some(chrono::Utc::now()), base_id: None, - files: Some(files), + files: Some(summary.files), }; let transaction = Transaction::new( diff --git a/rust/lance/src/index/vector/builder.rs b/rust/lance/src/index/vector/builder.rs index 08d8c32a001..e141f8f2b1e 100644 --- a/rust/lance/src/index/vector/builder.rs +++ b/rust/lance/src/index/vector/builder.rs @@ -69,6 +69,7 @@ use lance_io::stream::RecordBatchStream; use lance_io::{object_store::ObjectStore, stream::RecordBatchStreamAdapter}; use lance_linalg::distance::{DistanceType, Dot, L2, Normalize}; use lance_linalg::kernels::normalize_fsl; +use lance_table::format::IndexFile; use log::info; use object_store::path::Path; use prost::Message; @@ -171,6 +172,11 @@ type BuildStream = type UnindexedStream = Box> + Send + Unpin + 'static>; +pub struct VectorIndexBuildSummary { + pub indices_merged: usize, + pub files: Vec, +} + impl IvfIndexBuilder { #[allow(clippy::too_many_arguments)] pub fn new( @@ -282,9 +288,8 @@ impl IvfIndexBuilder }) } - // build the index with the all data in the dataset, - // return the number of indices merged - pub async fn build(&mut self) -> Result { + // build the index and return the files created by the writer. 
+ pub async fn build(&mut self) -> Result { let progress = self.progress.clone(); // step 1. train IVF & quantizer @@ -318,13 +323,16 @@ impl IvfIndexBuilder .stage_start("merge_partitions", num_partitions, "partitions") .await?; let build_idx_stream = self.build_partitions().boxed().await?; - self.merge_partitions(build_idx_stream).await?; + let files = self.merge_partitions(build_idx_stream).await?; progress.stage_complete("merge_partitions").await?; - Ok(self.merged_num) + Ok(VectorIndexBuildSummary { + indices_merged: self.merged_num, + files, + }) } - pub async fn remap(&mut self, mapping: &HashMap>) -> Result<()> { + pub async fn remap(&mut self, mapping: &HashMap>) -> Result> { if self.existing_indices.is_empty() { return Err(Error::invalid_input( "No existing indices available for remapping", @@ -359,13 +367,14 @@ impl IvfIndexBuilder } }); - self.merge_partitions( - stream::iter(build_iter) - .buffered(get_num_compute_intensive_cpus()) - .boxed(), - ) - .await?; - Ok(()) + let files = self + .merge_partitions( + stream::iter(build_iter) + .buffered(get_num_compute_intensive_cpus()) + .boxed(), + ) + .await?; + Ok(files) } pub fn with_ivf(&mut self, ivf: IvfModel) -> &mut Self { @@ -1108,7 +1117,10 @@ impl IvfIndexBuilder } #[instrument(name = "merge_partitions", level = "debug", skip_all)] - async fn merge_partitions(&mut self, mut build_stream: BuildStream) -> Result<()> { + async fn merge_partitions( + &mut self, + mut build_stream: BuildStream, + ) -> Result> { let Some(ivf) = self.ivf.as_ref() else { return Err(Error::invalid_input("IVF not set before merge partitions")); }; @@ -1347,12 +1359,21 @@ impl IvfIndexBuilder serde_json::to_string(&partition_index_metadata)?, ); - storage_writer.finish().await?; - index_writer.finish().await?; + let storage_summary = storage_writer.finish().await?; + let index_summary = index_writer.finish().await?; log::info!("merging {} partitions done", ivf.num_partitions()); - Ok(()) + Ok(vec![ + IndexFile { + path: 
INDEX_AUXILIARY_FILE_NAME.to_string(), + size_bytes: storage_summary.size_bytes, + }, + IndexFile { + path: INDEX_FILE_NAME.to_string(), + size_bytes: index_summary.size_bytes, + }, + ]) } // take raw vectors from the dataset diff --git a/rust/lance/src/index/vector/ivf.rs b/rust/lance/src/index/vector/ivf.rs index 2f3cc4588fa..aa820da3fa3 100644 --- a/rust/lance/src/index/vector/ivf.rs +++ b/rust/lance/src/index/vector/ivf.rs @@ -99,7 +99,7 @@ use lance_io::{ }; use lance_linalg::distance::{DistanceType, Dot, L2, MetricType}; use lance_linalg::{distance::Normalize, kernels::normalize_fsl_owned}; -use lance_table::format::{IndexMetadata as TableIndexMetadata, list_index_files_with_sizes}; +use lance_table::format::{IndexFile, IndexMetadata as TableIndexMetadata}; use log::{info, warn}; use object_store::path::Path; use prost::Message; @@ -378,14 +378,14 @@ pub(crate) fn select_segment_for_single_rebalance( // TODO: move to `lance-index` crate. /// -/// Returns (new_uuid, num_indices_merged) +/// Returns (new_uuid, num_indices_merged, files) pub(crate) async fn optimize_vector_indices( dataset: Dataset, unindexed: Option, vector_column: &str, logical_index: &LogicalIvfView<'_>, options: &OptimizeOptions, -) -> Result<(Uuid, usize)> { +) -> Result<(Uuid, usize, Vec)> { let existing_indices = logical_index.indices().cloned().collect::>(); // Sanity check the indices if existing_indices.is_empty() { @@ -422,49 +422,51 @@ pub(crate) async fn optimize_vector_indices( "optimizing vector index: the first index isn't IVF".to_string(), ))?; - let merged = if let Some(pq_index) = first_idx.sub_index.as_any().downcast_ref::() { - optimize_ivf_pq_indices( - first_idx, - pq_index, - vector_column, - unindexed, - &existing_indices, - options, - writer, - dataset.version().version, - ) - .await? 
- } else if let Some(hnsw_sq) = first_idx - .sub_index - .as_any() - .downcast_ref::>() - { - let aux_file = dataset - .indices_dir() - .join(new_uuid.to_string()) - .join(INDEX_AUXILIARY_FILE_NAME); - let aux_writer = object_store.create(&aux_file).await?; - optimize_ivf_hnsw_indices( - Arc::new(dataset), - first_idx, - hnsw_sq, - vector_column, - unindexed, - &existing_indices, - options, - writer, - aux_writer, - ) - .await? - } else { - return Err(Error::index( - "optimizing vector index: the sub index isn't PQ or HNSW".to_string(), - )); - }; + let (merged, files) = + if let Some(pq_index) = first_idx.sub_index.as_any().downcast_ref::() { + let (merged, file) = optimize_ivf_pq_indices( + first_idx, + pq_index, + vector_column, + unindexed, + &existing_indices, + options, + writer, + dataset.version().version, + ) + .await?; + (merged, vec![file]) + } else if let Some(hnsw_sq) = first_idx + .sub_index + .as_any() + .downcast_ref::>() + { + let aux_file = dataset + .indices_dir() + .join(new_uuid.to_string()) + .join(INDEX_AUXILIARY_FILE_NAME); + let aux_writer = object_store.create(&aux_file).await?; + optimize_ivf_hnsw_indices( + Arc::new(dataset), + first_idx, + hnsw_sq, + vector_column, + unindexed, + &existing_indices, + options, + writer, + aux_writer, + ) + .await? 
+ } else { + return Err(Error::index( + "optimizing vector index: the sub index isn't PQ or HNSW".to_string(), + )); + }; // never change the index version, // because we won't update the legacy vector index format - Ok((new_uuid, merged)) + Ok((new_uuid, merged, files)) } pub(crate) async fn optimize_vector_indices_v2( @@ -473,7 +475,7 @@ pub(crate) async fn optimize_vector_indices_v2( vector_column: &str, existing_indices: &[Arc], options: &OptimizeOptions, -) -> Result<(Uuid, usize)> { +) -> Result<(Uuid, usize, Vec)> { // Sanity check the indices if existing_indices.is_empty() { return Err(Error::index( @@ -498,7 +500,7 @@ pub(crate) async fn optimize_vector_indices_v2( let shuffler = create_ivf_shuffler(temp_dir_path, num_partitions, format_version, None); let (_, element_type) = get_vector_type(dataset.schema(), vector_column)?; - let merged_num = match index_type { + let summary = match index_type { // IVF_FLAT (SubIndexType::Flat, QuantizationType::Flat) => { if element_type == DataType::UInt8 { @@ -707,7 +709,7 @@ pub(crate) async fn optimize_vector_indices_v2( } }; - Ok((new_uuid, merged_num)) + Ok((new_uuid, summary.indices_merged, summary.files)) } #[allow(clippy::too_many_arguments)] @@ -720,7 +722,7 @@ async fn optimize_ivf_pq_indices( options: &OptimizeOptions, mut writer: Box, dataset_version: u64, -) -> Result { +) -> Result<(usize, IndexFile)> { let metric_type = first_idx.metric_type; let dim = first_idx.ivf.dimension(); @@ -787,9 +789,16 @@ async fn optimize_ivf_pq_indices( // TODO: for now the IVF_PQ index file format hasn't been updated, so keep the old version, // change it to latest version value after refactoring the IVF_PQ writer.write_magics(pos, 0, 1, MAGIC).await?; + let size_bytes = writer.tell().await? 
as u64; Writer::shutdown(writer.as_mut()).await?; - Ok(existing_indices.len() - start_pos) + Ok(( + existing_indices.len() - start_pos, + IndexFile { + path: INDEX_FILE_NAME.to_string(), + size_bytes, + }, + )) } #[allow(clippy::too_many_arguments)] @@ -803,7 +812,7 @@ async fn optimize_ivf_hnsw_indices( options: &OptimizeOptions, writer: Box, aux_writer: Box, -) -> Result { +) -> Result<(usize, Vec)> { let distance_type = first_idx.metric_type; let quantizer = hnsw_index.quantizer().clone(); let ivf = lance_index::vector::ivf::new_ivf_transformer_with_quantizer( @@ -940,13 +949,27 @@ async fn optimize_ivf_hnsw_indices( writer.add_metadata(IVF_PARTITION_KEY, &hnsw_metadata_json.to_string()); ivf_mut.write(&mut writer).await?; + let index_size = writer.tell().await? as u64; writer.finish().await?; // Write the aux file aux_ivf.write(&mut aux_writer).await?; + let aux_size = aux_writer.tell().await? as u64; aux_writer.finish().await?; - Ok(existing_indices.len() - start_pos) + Ok(( + existing_indices.len() - start_pos, + vec![ + IndexFile { + path: INDEX_FILE_NAME.to_string(), + size_bytes: index_size, + }, + IndexFile { + path: INDEX_AUXILIARY_FILE_NAME.to_string(), + size_bytes: aux_size, + }, + ], + )) } #[derive(Serialize)] @@ -1596,7 +1619,7 @@ pub async fn build_ivf_pq_index( ivf_params: &IvfBuildParams, pq_params: &PQBuildParams, progress: std::sync::Arc, -) -> Result<()> { +) -> Result> { let (ivf_model, pq) = build_ivf_model_and_pq( dataset, column, @@ -1609,7 +1632,7 @@ pub async fn build_ivf_pq_index( let stream = scan_index_field_stream(dataset, column).await?; let precomputed_partitions = load_precomputed_partitions_if_available(ivf_params).await?; - write_ivf_pq_file( + let file = write_ivf_pq_file( dataset.object_store.as_ref(), dataset.indices_dir(), column, @@ -1625,7 +1648,8 @@ pub async fn build_ivf_pq_index( ivf_params.shuffle_partition_concurrency, ivf_params.precomputed_shuffle_buffers.clone(), ) - .await + .await?; + Ok(vec![file]) } 
#[allow(clippy::too_many_arguments)] @@ -1738,7 +1762,7 @@ pub(crate) async fn remap_index_file_v3( index: Arc, mapping: &HashMap>, column: String, -) -> Result<()> { +) -> Result> { let dataset = dataset.clone(); let index_dir = dataset.indices_dir().join(new_uuid); let (_, element_type) = get_vector_type(dataset.schema(), &column)?; @@ -1839,7 +1863,7 @@ pub(crate) async fn remap_index_file( name: String, column: String, transforms: Vec, -) -> Result<()> { +) -> Result { let object_store = dataset.object_store.as_ref(); let old_path = dataset.indices_dir().join(old_uuid).join(INDEX_FILE_NAME); let new_path = dataset.indices_dir().join(new_uuid).join(INDEX_FILE_NAME); @@ -1893,9 +1917,13 @@ pub(crate) async fn remap_index_file( // TODO: for now the IVF_PQ index file format hasn't been updated, so keep the old version, // change it to latest version value after refactoring the IVF_PQ writer.write_magics(pos, 0, 1, MAGIC).await?; + let size_bytes = writer.tell().await? as u64; Writer::shutdown(writer.as_mut()).await?; - Ok(()) + Ok(IndexFile { + path: INDEX_FILE_NAME.to_string(), + size_bytes, + }) } /// Write the index to the index file. @@ -1916,7 +1944,7 @@ async fn write_ivf_pq_file( shuffle_partition_batches: usize, shuffle_partition_concurrency: usize, precomputed_shuffle_buffers: Option<(Path, Vec)>, -) -> Result<()> { +) -> Result { let path = index_dir.clone().join(uuid).join(INDEX_FILE_NAME); let mut writer = object_store.create(&path).await?; @@ -1954,9 +1982,13 @@ async fn write_ivf_pq_file( // TODO: for now the IVF_PQ index file format hasn't been updated, so keep the old version, // change it to latest version value after refactoring the IVF_PQ writer.write_magics(pos, 0, 1, MAGIC).await?; + let size_bytes = writer.tell().await? 
as u64; Writer::shutdown(writer.as_mut()).await?; - Ok(()) + Ok(IndexFile { + path: INDEX_FILE_NAME.to_string(), + size_bytes, + }) } pub async fn write_ivf_pq_file_from_existing_index( @@ -2168,7 +2200,7 @@ pub(crate) async fn merge_segments_with_progress( let index_version = infer_source_index_version(&segments)?; let segment_uuid = Uuid::new_v4(); let final_dir = indices_dir.clone().join(segment_uuid.to_string()); - merge_segments_to_dir( + let files = merge_segments_to_dir( object_store, indices_dir, &final_dir, @@ -2177,7 +2209,6 @@ pub(crate) async fn merge_segments_with_progress( progress, ) .await?; - let files = list_index_files_with_sizes(object_store, &final_dir).await?; merged_segment = TableIndexMetadata { uuid: segment_uuid, @@ -2204,7 +2235,7 @@ async fn merge_segments_to_dir( segments: &[TableIndexMetadata], _requested_index_type: Option, progress: Arc, -) -> Result<()> { +) -> Result> { reset_final_segment_dir(object_store, final_dir).await?; debug_assert!( @@ -2231,14 +2262,15 @@ async fn merge_segments_to_dir( }) .collect::>(); - lance_index::vector::distributed::index_merger::merge_partial_vector_auxiliary_files( - object_store, - &aux_paths, - final_dir, - progress.clone(), - ) - .await?; - write_root_vector_index_from_auxiliary( + let auxiliary_file = + lance_index::vector::distributed::index_merger::merge_partial_vector_auxiliary_files( + object_store, + &aux_paths, + final_dir, + progress.clone(), + ) + .await?; + let index_file = write_root_vector_index_from_auxiliary( object_store, final_dir, None, @@ -2247,7 +2279,7 @@ async fn merge_segments_to_dir( ) .await?; - Ok(()) + Ok(vec![auxiliary_file, index_file]) } fn infer_source_index_version(group: &[TableIndexMetadata]) -> Result { @@ -2277,7 +2309,7 @@ async fn write_root_vector_index_from_auxiliary( requested_index_type: Option, centroid_source_index_paths: &[Path], progress: Arc, -) -> Result<()> { +) -> Result { let aux_path = index_dir.clone().join(INDEX_AUXILIARY_FILE_NAME); let 
scheduler = ScanScheduler::new( Arc::new(object_store.clone()), @@ -2418,11 +2450,14 @@ async fn write_root_vector_index_from_auxiliary( let empty_batch = RecordBatch::new_empty(arrow_schema); v2_writer.write_batch(&empty_batch).await?; - v2_writer.finish().await?; + let summary = v2_writer.finish().await?; progress.stage_progress("write_root_index", 1).await?; progress.stage_complete("write_root_index").await?; - Ok(()) + Ok(IndexFile { + path: INDEX_FILE_NAME.to_string(), + size_bytes: summary.size_bytes, + }) } async fn do_train_ivf_model( diff --git a/rust/lance/src/index/vector/ivf/v2.rs b/rust/lance/src/index/vector/ivf/v2.rs index b47b00d409c..510b0cc45a6 100644 --- a/rust/lance/src/index/vector/ivf/v2.rs +++ b/rust/lance/src/index/vector/ivf/v2.rs @@ -1723,6 +1723,7 @@ mod tests { use lance_index::IndexType; use lance_index::progress::IndexBuildProgress; use lance_index::vector::DIST_COL; + use lance_index::vector::hnsw::builder::HnswBuildParams; use lance_index::vector::ivf::IvfBuildParams; use lance_index::vector::kmeans::{KMeansParams, train_kmeans}; use lance_index::vector::pq::PQBuildParams; @@ -1735,7 +1736,6 @@ mod tests { }; use lance_index::{INDEX_AUXILIARY_FILE_NAME, metrics::NoOpMetricsCollector}; use lance_index::{optimize::OptimizeOptions, scalar::IndexReader}; - use lance_index::{scalar::IndexWriter, vector::hnsw::builder::HnswBuildParams}; use lance_io::{ object_store::ObjectStore, scheduler::{ScanScheduler, SchedulerConfig}, @@ -4739,7 +4739,10 @@ mod tests { STORAGE_METADATA_KEY.to_owned(), serde_json::to_string(&vec![pq_metadata])?, ); - writer.finish_with_metadata(metadata).await?; + for (key, value) in metadata { + writer.add_schema_metadata(key, value); + } + writer.finish().await?; // Build new IndexMetadata with the new UUID and file sizes. 
let new_files = From c186e419863ed3d51faa0d62e6b4480a7b0333f0 Mon Sep 17 00:00:00 2001 From: Wyatt Alt Date: Mon, 8 Jun 2026 07:19:40 -0700 Subject: [PATCH 050/177] perf: reuse WAND lead buffer capacity in push_back_leads (#7153) Co-authored-by: Claude Opus 4.8 (1M context) --- rust/lance-index/src/scalar/inverted/wand.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rust/lance-index/src/scalar/inverted/wand.rs b/rust/lance-index/src/scalar/inverted/wand.rs index 6d607670aa9..609ec08041f 100644 --- a/rust/lance-index/src/scalar/inverted/wand.rs +++ b/rust/lance-index/src/scalar/inverted/wand.rs @@ -1368,8 +1368,8 @@ impl<'a, S: Scorer> Wand<'a, S> { // After finishing a candidate doc, convert the aligned iterators back // into lagging iterators. Entries that do not stay in `tail` are // advanced to `target` and returned to `head`. - let leads = std::mem::take(&mut self.lead); - for posting in leads { + // pop() drains in place, keeping self.lead's capacity for reuse. + while let Some(posting) = self.lead.pop() { let upper_bound = posting.approximate_upper_bound(); if let Some(mut evicted) = self.insert_tail_with_overflow(posting, upper_bound) { evicted.next(target); From 0fa8ae6e3d5b4e4e008ec0a855c1b5a0abd2937d Mon Sep 17 00:00:00 2001 From: Wyatt Alt Date: Mon, 8 Jun 2026 08:21:13 -0700 Subject: [PATCH 051/177] fix: remove empty index segment when committing real segments (#7141) A 0-fragment index segment is disjoint from every incoming fragment set, so commit_existing_index_segments never removed it and it persisted next to the real segments. A leftover empty segment then breaks load_segment_details, whose index details can disagree with the real segments. Remove an empty existing segment once any real segment is committed. 
Co-authored-by: Claude Opus 4.8 (1M context) --- rust/lance/src/index.rs | 70 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index c973fe1ad95..c628dbe5919 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -1220,6 +1220,13 @@ impl DatasetIndexExt for Dataset { return Ok(Some(idx)); }; + // A zero-fragment segment can be used to create an index while + // deferring the actual build. Such a segment is disjoint from every + // other segment but should still be removed. + if existing_fragments.is_empty() { + return Ok(Some(idx)); + } + if existing_fragments.is_disjoint(&incoming_fragments) { return Ok(None); } @@ -6808,6 +6815,69 @@ mod tests { assert!(err.to_string().contains("would orphan fragments")); } + #[tokio::test] + async fn test_commit_existing_index_segments_removes_empty_segment() { + use lance_datagen::{BatchCount, RowCount, array}; + + let test_dir = tempfile::tempdir().unwrap(); + let reader = lance_datagen::gen_batch() + .col("id", array::step::()) + .col( + "vector", + array::rand_vec::(8.into()), + ) + .into_reader_rows(RowCount::from(10), BatchCount::from(1)); + let mut dataset = Dataset::write(reader, test_dir.path().to_str().unwrap(), None) + .await + .unwrap(); + let field_id = dataset.schema().field("vector").unwrap().id; + let uuid = Uuid::new_v4(); + + // Commit a 0-fragment segment, then a real segment covering the dataset. 
+ let empty = write_vector_segment_metadata( + &dataset, + "vector_idx", + field_id, + uuid, + std::iter::empty::(), + b"empty", + ) + .await; + dataset + .commit_existing_index_segments( + "vector_idx", + "vector", + vec![segment_from_metadata(&empty)], + ) + .await + .unwrap(); + let seg = write_vector_segment_metadata( + &dataset, + "vector_idx", + field_id, + Uuid::new_v4(), + [0_u32], + b"seg", + ) + .await; + dataset + .commit_existing_index_segments( + "vector_idx", + "vector", + vec![segment_from_metadata(&seg)], + ) + .await + .unwrap(); + + // The real segment covers the dataset, so the redundant empty one is removed. + let committed = dataset.load_indices_by_name("vector_idx").await.unwrap(); + assert_eq!( + committed.iter().map(|i| i.uuid).collect::>(), + HashSet::from([seg.uuid]), + "empty segment should be removed once a real segment covers the dataset", + ); + } + #[tokio::test] async fn test_resolve_index_column_error_cases() { use lance_datagen::{BatchCount, RowCount, array}; From 8a30c5778b2408d3bd8ae33181d19f9d170a02ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D0=BA=20=D0=91=D1=83=D1=85=D0=BD=D0=B5?= =?UTF-8?q?=D1=80?= <66881554+Alowator@users.noreply.github.com> Date: Mon, 8 Jun 2026 22:21:38 +0700 Subject: [PATCH 052/177] docs: correct alter_columns nullable docstring (#7095) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary The `LanceDataset.alter_columns` docstring states that a nullable column **cannot** be made non-nullable: > Only non-nullable columns can be changed to nullable. Currently, you cannot change a nullable column to non-nullable. This is out of date. Support for changing a nullable column to non-nullable was added in #5589 (`validate_no_nulls_before_making_non_nullable` in `rust/lance/src/dataset/schema_evolution.rs`), and the Python binding already plumbs `nullable` through (`python/src/dataset.rs`). 
The only requirement is that the column contains no NULL values, otherwise an error is raised. This PR updates the docstring to describe the actual behavior in both directions. ## Changes - Reword the `"nullable"` parameter docs for `alter_columns` to reflect that a nullable column can be made non-nullable when it has no NULL values. ## Notes Docs-only change; no functional change and no test impact. Lint was deferred to CI for this docstring-only edit. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-authored-by: Claude Opus 4.8 --- python/python/lance/dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index c8132f83ba4..ad5a03a7ccd 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -2283,9 +2283,9 @@ def alter_columns(self, *alterations: Iterable[AlterColumn]): not changed. - "nullable": bool, optional Whether the column should be nullable. If not specified, the column - nullability is not changed. Only non-nullable columns can be changed - to nullable. Currently, you cannot change a nullable column to - non-nullable. + nullability is not changed. A non-nullable column can always be made + nullable. A nullable column can be made non-nullable only if it + contains no NULL values; otherwise an error is raised. - "data_type": pyarrow.DataType, optional The new data type to cast the column to. If not specified, the column data type is not changed. 
From a266f2bc23584b37b8fc03004502d2f089df0fe6 Mon Sep 17 00:00:00 2001 From: Lance Release Bot Date: Mon, 8 Jun 2026 16:21:38 +0000 Subject: [PATCH 053/177] chore: release beta version 8.0.0-beta.7 --- .bumpversion.toml | 2 +- Cargo.lock | 214 +++++++++++++++++++------------------- Cargo.toml | 42 ++++---- java/lance-jni/Cargo.lock | 166 ++++++++++++++--------------- java/lance-jni/Cargo.toml | 2 +- java/pom.xml | 2 +- python/Cargo.lock | 188 ++++++++++++++++----------------- python/Cargo.toml | 2 +- 8 files changed, 309 insertions(+), 309 deletions(-) diff --git a/.bumpversion.toml b/.bumpversion.toml index f237926b6a6..0973d1ae950 100644 --- a/.bumpversion.toml +++ b/.bumpversion.toml @@ -1,5 +1,5 @@ [tool.bumpversion] -current_version = "8.0.0-beta.6" +current_version = "8.0.0-beta.7" parse = "(?P\\d+)\\.(?P\\d+)\\.(?P\\d+)(-(?P(beta|rc))\\.(?P\\d+))?" serialize = [ "{major}.{minor}.{patch}-{prerelease}.{prerelease_num}", diff --git a/Cargo.lock b/Cargo.lock index d9d7588827e..603091c2f5b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -386,7 +386,7 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f633dbfdf39c039ada1bf9e34c694816eb71fbb7dc78f613993b7245e078a1ed" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "serde_core", "serde_json", ] @@ -540,7 +540,7 @@ dependencies = [ "bytes", "fastrand", "hex", - "http 1.4.1", + "http 1.4.2", "ring", "time", "tokio", @@ -602,7 +602,7 @@ dependencies = [ "bytes-utils", "fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "http-body 0.4.6", "http-body 1.0.1", "percent-encoding", @@ -630,7 +630,7 @@ dependencies = [ "bytes", "fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "regex-lite", "tracing", ] @@ -660,7 +660,7 @@ dependencies = [ "hex", "hmac 0.12.1", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "lru", "percent-encoding", @@ -689,7 +689,7 @@ dependencies = [ "bytes", "fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", 
"regex-lite", "tracing", ] @@ -713,7 +713,7 @@ dependencies = [ "bytes", "fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "regex-lite", "tracing", ] @@ -738,7 +738,7 @@ dependencies = [ "aws-types", "fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "regex-lite", "tracing", ] @@ -759,7 +759,7 @@ dependencies = [ "hex", "hmac 0.12.1", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "percent-encoding", "sha2 0.10.9", "time", @@ -788,7 +788,7 @@ dependencies = [ "bytes", "crc-fast", "hex", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "md-5 0.10.6", @@ -822,7 +822,7 @@ dependencies = [ "bytes-utils", "futures-core", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "percent-encoding", @@ -841,7 +841,7 @@ dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "h2", - "http 1.4.1", + "http 1.4.2", "hyper", "hyper-rustls", "hyper-util", @@ -898,7 +898,7 @@ dependencies = [ "bytes", "fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "http-body 0.4.6", "http-body 1.0.1", "http-body-util", @@ -918,7 +918,7 @@ dependencies = [ "aws-smithy-types", "bytes", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "pin-project-lite", "tokio", "tracing", @@ -936,7 +936,7 @@ dependencies = [ "bytes-utils", "futures-core", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "http-body 0.4.6", "http-body 1.0.1", "http-body-util", @@ -984,7 +984,7 @@ dependencies = [ "axum-core", "bytes", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "hyper", @@ -1017,7 +1017,7 @@ dependencies = [ "async-trait", "bytes", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "mime", @@ -1119,9 +1119,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.12.1" +version = "2.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"84d7ced0ae9557296835c32bf1b1e02b44c746701f898460fb000d7eaa84f00a" +checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" [[package]] name = "bitpacking" @@ -3086,7 +3086,7 @@ version = "25.12.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "rustc_version", ] @@ -3166,7 +3166,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow-array", "rand 0.9.4", @@ -3561,7 +3561,7 @@ dependencies = [ "fnv", "futures-core", "futures-sink", - "http 1.4.1", + "http 1.4.2", "indexmap 2.14.0", "slab", "tokio", @@ -3670,7 +3670,7 @@ checksum = "629d8f3bbeda9d148036d6b0de0a3ab947abd08ce90626327fc3547a49d59d97" dependencies = [ "dirs", "futures", - "http 1.4.1", + "http 1.4.2", "indicatif", "libc", "log", @@ -3694,7 +3694,7 @@ checksum = "430b33fa84f92796d4d263070b6c0d3ca219df7b9a0e1853ee431029b1612bcd" dependencies = [ "async-trait", "bytes", - "http 1.4.1", + "http 1.4.2", "more-asserts", "serde", "thiserror 2.0.18", @@ -3750,9 +3750,9 @@ dependencies = [ [[package]] name = "http" -version = "1.4.1" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8be7462df143984c4598a256ef469b251d7d7f9e271135073e78fc535414f3d0" +checksum = "6970f50e31d6fc17d3fa27329444bfa74e196cf62e95052a3f6fee181dba6425" dependencies = [ "bytes", "itoa", @@ -3776,7 +3776,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", - "http 1.4.1", + "http 1.4.2", ] [[package]] @@ -3787,7 +3787,7 @@ checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" dependencies = [ "bytes", "futures-core", - "http 1.4.1", + "http 1.4.2", 
"http-body 1.0.1", "pin-project-lite", ] @@ -3830,7 +3830,7 @@ dependencies = [ "futures-channel", "futures-core", "h2", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "httparse", "httpdate", @@ -3847,7 +3847,7 @@ version = "0.27.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33ca68d021ef39cf6463ab54c1d0f5daf03377b70561305bb89a8f83aab66e0f" dependencies = [ - "http 1.4.1", + "http 1.4.2", "hyper", "hyper-util", "rustls", @@ -3897,7 +3897,7 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "hyper", "ipnet", @@ -4224,7 +4224,7 @@ version = "0.7.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4d09b98f7eace8982db770e4408e7470b028ce513ac28fecdc6bf4c30fe92b62" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "cfg-if 1.0.4", "libc", ] @@ -4479,7 +4479,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "all_asserts", "approx", @@ -4583,7 +4583,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow-array", "arrow-buffer", @@ -4631,7 +4631,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrayref", "paste", @@ -4640,7 +4640,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow-array", "arrow-buffer", @@ -4679,7 +4679,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow", "arrow-array", @@ -4712,7 +4712,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow", "arrow-array", @@ -4732,7 +4732,7 @@ dependencies = [ 
[[package]] name = "lance-encoding" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow-arith", "arrow-array", @@ -4777,7 +4777,7 @@ dependencies = [ [[package]] name = "lance-examples" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "all_asserts", "arrow", @@ -4803,7 +4803,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow-arith", "arrow-array", @@ -4843,7 +4843,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "datafusion", "geo-traits", @@ -4857,7 +4857,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "approx", "arc-swap", @@ -4934,7 +4934,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow", "arrow-arith", @@ -4954,7 +4954,7 @@ dependencies = [ "criterion", "deepsize", "futures", - "http 1.4.1", + "http 1.4.2", "io-uring", "lance-arrow", "lance-core", @@ -4983,7 +4983,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "approx", "arrow-array", @@ -5003,7 +5003,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow", "async-trait", @@ -5015,7 +5015,7 @@ dependencies = [ [[package]] name = "lance-namespace-datafusion" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow-array", "arrow-schema", @@ -5031,7 +5031,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow", "arrow-array", @@ -5090,7 +5090,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow-array", "arrow-buffer", @@ -5109,7 
+5109,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow", "arrow-array", @@ -5156,7 +5156,7 @@ dependencies = [ [[package]] name = "lance-test-macros" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "proc-macro2", "quote", @@ -5165,7 +5165,7 @@ dependencies = [ [[package]] name = "lance-testing" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow-array", "arrow-schema", @@ -5178,7 +5178,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "icu_segmenter", "jieba-rs", @@ -5190,7 +5190,7 @@ dependencies = [ [[package]] name = "lance-tools" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "clap", "lance-core", @@ -5359,9 +5359,9 @@ dependencies = [ [[package]] name = "link-section" -version = "0.18.1" +version = "0.18.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "014e440054ce8170890229eeef5bcda955305e056ec713de40ed366944483f09" +checksum = "c2b1dd6fe32e55c0fc0ea9493aa57459ca3cf4ff3c857c7d0302290150da6e4f" [[package]] name = "linktime-proc-macro" @@ -5596,9 +5596,9 @@ dependencies = [ [[package]] name = "mock_instant" -version = "0.6.0" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dce6dd36094cac388f119d2e9dc82dc730ef91c32a6222170d630e5414b956e6" +checksum = "9bb517913cfcfb9eeda59f36020269075a152701a01606c612f547e4890be399" [[package]] name = "mockall" @@ -5905,7 +5905,7 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", ] [[package]] @@ -5950,7 +5950,7 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body-util", "httparse", "humantime", @@ 
-6017,7 +6017,7 @@ version = "6.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0cc3cbf698f9438986c11a880c90a6d04b9de27575afd28bbf45b154b6c709e2" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "libc", "once_cell", "onig_sys", @@ -6072,7 +6072,7 @@ dependencies = [ "base64 0.22.1", "bytes", "futures", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "jiff", "log", @@ -6097,7 +6097,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0d6f81ba6960e3fae1882f253b114b21d7e444e1534f209c7737a79f6243eb6f" dependencies = [ "futures", - "http 1.4.1", + "http 1.4.2", "mea", "opendal-core", ] @@ -6141,7 +6141,7 @@ checksum = "0030644366ef5d8cbe3a4a5822bf99a4aafddc1666e9d24b44d158d9062fc76a" dependencies = [ "base64 0.22.1", "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "opendal-service-azure-common", @@ -6162,7 +6162,7 @@ checksum = "6dea4908d490143a9b0b7f7a790e139ff829b06a023f670455ed3d44f664b361" dependencies = [ "base64 0.22.1", "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "opendal-service-azure-common", @@ -6180,7 +6180,7 @@ version = "0.57.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b489f13c42e69d69bdd72952b634356ec43a7881a20259b38b540fcecdf4051" dependencies = [ - "http 1.4.1", + "http 1.4.2", "opendal-core", ] @@ -6191,7 +6191,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aa8cafe9729213375c7331019b0cb756ad3e1aff7f45cd32c45eae91ebde8901" dependencies = [ "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "quick-xml 0.39.4", @@ -6209,7 +6209,7 @@ checksum = "48de101aac565ed06af4b47903c24eafd249075553ec1fb18256751c45148d47" dependencies = [ "async-trait", "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "percent-encoding", @@ -6244,7 +6244,7 @@ checksum = "c4922661976a1d40794a2adfbdb888cc3c23097690f825a92f773af38908a848" dependencies = [ "bytes", "hf-xet", - 
"http 1.4.1", + "http 1.4.2", "log", "opendal-core", "percent-encoding", @@ -6260,7 +6260,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "328fa55e8888cbdfe00826bfea2a79042422b720e8369e9e021e46121dea5ace" dependencies = [ "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "quick-xml 0.39.4", @@ -6279,7 +6279,7 @@ dependencies = [ "base64 0.22.1", "bytes", "crc32c", - "http 1.4.1", + "http 1.4.2", "log", "md-5 0.11.0", "opendal-core", @@ -6298,7 +6298,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f2f7a4c32e5202eb4ac72e76c4b5e30c86ab60762811172f4111103b9d673a1" dependencies = [ "bytes", - "http 1.4.1", + "http 1.4.2", "opendal-core", "quick-xml 0.39.4", "reqsign-core", @@ -6314,7 +6314,7 @@ version = "0.10.80" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a45fa2aa886c42762255da344f0a0d313e254066c46aad76f300c3d3da62d967" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "cfg-if 1.0.4", "foreign-types", "libc", @@ -6879,7 +6879,7 @@ checksum = "4b45fcc2344c680f5025fe57779faef368840d0bd1f42f216291f0dc4ace4744" dependencies = [ "bit-set", "bit-vec", - "bitflags 2.12.1", + "bitflags 2.13.0", "num-traits", "rand 0.9.4", "rand_chacha 0.9.0", @@ -6892,9 +6892,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568" +checksum = "528ac67416ff8646872a3c02cad9cc4ee5dc9f9540c9b10771855c95cb2e5ae1" dependencies = [ "bytes", "prost-derive", @@ -6902,9 +6902,9 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" +checksum = "03da047801ff44bb6a4d407d4860c05fd70bb81714e6b2f3812603d5b145b042" dependencies = [ 
"heck", "itertools 0.14.0", @@ -6921,9 +6921,9 @@ dependencies = [ [[package]] name = "prost-derive" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" +checksum = "b570b25f7617e43d59005d0990ccb79e950a423952cea19671b7a876da390adf" dependencies = [ "anyhow", "itertools 0.14.0", @@ -6934,9 +6934,9 @@ dependencies = [ [[package]] name = "prost-types" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8991c4cbdb8bc5b11f0b074ffe286c30e523de90fee5ba8132f1399f23cb3dd7" +checksum = "f94967dc7688f3054c7fac87473ffae4cc4c3904800e2d9f5b857246d8963b0a" dependencies = [ "prost", ] @@ -7281,7 +7281,7 @@ version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", ] [[package]] @@ -7383,7 +7383,7 @@ checksum = "372266b4733756738eeb199a98188037d27a0989980e2600ae7ce1faf00a867d" dependencies = [ "anyhow", "form_urlencoded", - "http 1.4.1", + "http 1.4.2", "log", "percent-encoding", "reqsign-core", @@ -7402,7 +7402,7 @@ dependencies = [ "bytes", "form_urlencoded", "hex", - "http 1.4.1", + "http 1.4.2", "log", "percent-encoding", "quick-xml 0.40.1", @@ -7424,7 +7424,7 @@ dependencies = [ "base64 0.22.1", "bytes", "form_urlencoded", - "http 1.4.1", + "http 1.4.2", "log", "pem", "percent-encoding", @@ -7448,7 +7448,7 @@ dependencies = [ "futures", "hex", "hmac 0.13.0", - "http 1.4.1", + "http 1.4.2", "jiff", "log", "percent-encoding", @@ -7478,7 +7478,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eb215d0876a18b6bd9cdd380b589e5292aaa638ca15266de794b1122d898b6b2" dependencies = [ "form_urlencoded", - "http 1.4.1", + "http 1.4.2", "log", "percent-encoding", "reqsign-aws-v4", @@ -7496,7 +7496,7 @@ 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84110aabba799fbcd48b3abb51fbbff4749f879252e5806b6f5d0cbe0fef6abb" dependencies = [ "anyhow", - "http 1.4.1", + "http 1.4.2", "log", "percent-encoding", "reqsign-core", @@ -7511,7 +7511,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91d083a363b3577f519ce8425bb50f902622a28a83f7c4a26a5c990b66ec75b3" dependencies = [ "anyhow", - "http 1.4.1", + "http 1.4.2", "log", "percent-encoding", "reqsign-core", @@ -7529,7 +7529,7 @@ dependencies = [ "futures-core", "futures-util", "h2", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "hyper", @@ -7576,7 +7576,7 @@ dependencies = [ "bytes", "futures-core", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "hyper", @@ -7614,7 +7614,7 @@ checksum = "07bc3f1384cffa4f274dad2d4ddd73aed32fed8f786d96c6be8aa4e5fd3c3b58" dependencies = [ "anyhow", "async-trait", - "http 1.4.1", + "http 1.4.2", "reqwest 0.13.4", "thiserror 2.0.18", "tower-service", @@ -7797,7 +7797,7 @@ version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "errno", "libc", "linux-raw-sys", @@ -8015,7 +8015,7 @@ version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "core-foundation 0.10.1", "core-foundation-sys", "libc", @@ -8706,7 +8706,7 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a13f3d0daba03132c0aa9767f98351b3488edc2c100cda2d2ec2b04f3d8d3c8b" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "core-foundation 0.9.4", "system-configuration-sys", ] @@ -9086,7 +9086,7 @@ dependencies = [ "base64 0.22.1", "bytes", "h2", - 
"http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "hyper", @@ -9140,9 +9140,9 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e9cd434a998747dd2c4276bc96ee2e0c7a2eadf3cae88e52be55a05fa9053f5" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "bytes", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "pin-project-lite", @@ -9158,11 +9158,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4cfcf7e2740e6fc6d4d688b4ef00650406bb94adf4731e43c096c3a19fe40840" dependencies = [ "async-compression", - "bitflags 2.12.1", + "bitflags 2.13.0", "bytes", "futures-core", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "pin-project-lite", @@ -9728,7 +9728,7 @@ version = "0.244.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "hashbrown 0.15.5", "indexmap 2.14.0", "semver", @@ -10130,7 +10130,7 @@ dependencies = [ "base64 0.22.1", "deadpool", "futures", - "http 1.4.1", + "http 1.4.2", "http-body-util", "hyper", "hyper-util", @@ -10207,7 +10207,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" dependencies = [ "anyhow", - "bitflags 2.12.1", + "bitflags 2.13.0", "indexmap 2.14.0", "log", "serde", @@ -10290,7 +10290,7 @@ dependencies = [ "clap", "crc32fast", "futures", - "http 1.4.1", + "http 1.4.2", "hyper", "lazy_static", "more-asserts", @@ -10364,7 +10364,7 @@ dependencies = [ "chrono", "clap", "gearhash", - "http 1.4.1", + "http 1.4.2", "itertools 0.14.0", "lazy_static", "more-asserts", diff --git a/Cargo.toml b/Cargo.toml index 6a267348167..e5e834020cb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,7 +31,7 @@ resolver = "3" [workspace.package] -version = "8.0.0-beta.6" 
+version = "8.0.0-beta.7" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" @@ -56,26 +56,26 @@ rust-version = "1.91.0" [workspace.dependencies] arc-swap = "1.7" libc = "0.2.176" -lance = { version = "=8.0.0-beta.6", path = "./rust/lance", default-features = false } -lance-arrow = { version = "=8.0.0-beta.6", path = "./rust/lance-arrow" } -lance-core = { version = "=8.0.0-beta.6", path = "./rust/lance-core" } -lance-datafusion = { version = "=8.0.0-beta.6", path = "./rust/lance-datafusion" } -lance-datagen = { version = "=8.0.0-beta.6", path = "./rust/lance-datagen" } -lance-encoding = { version = "=8.0.0-beta.6", path = "./rust/lance-encoding" } -lance-file = { version = "=8.0.0-beta.6", path = "./rust/lance-file" } -lance-geo = { version = "=8.0.0-beta.6", path = "./rust/lance-geo" } -lance-index = { version = "=8.0.0-beta.6", path = "./rust/lance-index" } -lance-io = { version = "=8.0.0-beta.6", path = "./rust/lance-io", default-features = false } -lance-linalg = { version = "=8.0.0-beta.6", path = "./rust/lance-linalg" } -lance-namespace = { version = "=8.0.0-beta.6", path = "./rust/lance-namespace" } -lance-namespace-impls = { version = "=8.0.0-beta.6", path = "./rust/lance-namespace-impls" } +lance = { version = "=8.0.0-beta.7", path = "./rust/lance", default-features = false } +lance-arrow = { version = "=8.0.0-beta.7", path = "./rust/lance-arrow" } +lance-core = { version = "=8.0.0-beta.7", path = "./rust/lance-core" } +lance-datafusion = { version = "=8.0.0-beta.7", path = "./rust/lance-datafusion" } +lance-datagen = { version = "=8.0.0-beta.7", path = "./rust/lance-datagen" } +lance-encoding = { version = "=8.0.0-beta.7", path = "./rust/lance-encoding" } +lance-file = { version = "=8.0.0-beta.7", path = "./rust/lance-file" } +lance-geo = { version = "=8.0.0-beta.7", path = "./rust/lance-geo" } +lance-index = { version = "=8.0.0-beta.7", path = "./rust/lance-index" } +lance-io = { version = "=8.0.0-beta.7", path = "./rust/lance-io", 
default-features = false } +lance-linalg = { version = "=8.0.0-beta.7", path = "./rust/lance-linalg" } +lance-namespace = { version = "=8.0.0-beta.7", path = "./rust/lance-namespace" } +lance-namespace-impls = { version = "=8.0.0-beta.7", path = "./rust/lance-namespace-impls" } lance-namespace-datafusion = { version = "=7.0.0-beta.9", path = "./rust/lance-namespace-datafusion" } lance-namespace-reqwest-client = "0.8.2" -lance-select = { version = "=8.0.0-beta.6", path = "./rust/lance-select" } -lance-tokenizer = { version = "=8.0.0-beta.6", path = "./rust/lance-tokenizer" } -lance-table = { version = "=8.0.0-beta.6", path = "./rust/lance-table" } -lance-test-macros = { version = "=8.0.0-beta.6", path = "./rust/lance-test-macros" } -lance-testing = { version = "=8.0.0-beta.6", path = "./rust/lance-testing" } +lance-select = { version = "=8.0.0-beta.7", path = "./rust/lance-select" } +lance-tokenizer = { version = "=8.0.0-beta.7", path = "./rust/lance-tokenizer" } +lance-table = { version = "=8.0.0-beta.7", path = "./rust/lance-table" } +lance-test-macros = { version = "=8.0.0-beta.7", path = "./rust/lance-test-macros" } +lance-testing = { version = "=8.0.0-beta.7", path = "./rust/lance-testing" } approx = "0.5.1" # Note that this one does not include pyarrow arrow = { version = "58.0.0", optional = false, features = ["prettyprint"] } @@ -102,7 +102,7 @@ half = { "version" = "2.1", default-features = false, features = [ "num-traits", "std", ] } -lance-bitpacking = { version = "=8.0.0-beta.6", path = "./rust/compression/bitpacking" } +lance-bitpacking = { version = "=8.0.0-beta.7", path = "./rust/compression/bitpacking" } bitpacking = "0.9" bitvec = "1" bytes = "1.11.1" @@ -142,7 +142,7 @@ deepsize = "0.2.0" dirs = "6.0.0" either = "1.0" fst = { version = "0.4.7", features = ["levenshtein"] } -fsst = { version = "=8.0.0-beta.6", path = "./rust/compression/fsst" } +fsst = { version = "=8.0.0-beta.7", path = "./rust/compression/fsst" } futures = "0.3" geoarrow-array = 
"0.8" geoarrow-schema = "0.8" diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index fa08fd758aa..8860080ca44 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -491,7 +491,7 @@ dependencies = [ "bytes", "fastrand", "hex", - "http 1.4.1", + "http 1.4.2", "ring", "time", "tokio", @@ -551,7 +551,7 @@ dependencies = [ "bytes", "bytes-utils", "fastrand", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "percent-encoding", "pin-project-lite", @@ -578,7 +578,7 @@ dependencies = [ "bytes", "fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "regex-lite", "tracing", ] @@ -602,7 +602,7 @@ dependencies = [ "bytes", "fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "regex-lite", "tracing", ] @@ -627,7 +627,7 @@ dependencies = [ "aws-types", "fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "regex-lite", "tracing", ] @@ -647,7 +647,7 @@ dependencies = [ "hex", "hmac 0.12.1", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "percent-encoding", "sha2 0.10.9", "time", @@ -677,7 +677,7 @@ dependencies = [ "bytes-utils", "futures-core", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "percent-encoding", @@ -696,7 +696,7 @@ dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "h2", - "http 1.4.1", + "http 1.4.2", "hyper", "hyper-rustls", "hyper-util", @@ -753,7 +753,7 @@ dependencies = [ "bytes", "fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "http-body 0.4.6", "http-body 1.0.1", "http-body-util", @@ -773,7 +773,7 @@ dependencies = [ "aws-smithy-types", "bytes", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "pin-project-lite", "tokio", "tracing", @@ -790,7 +790,7 @@ dependencies = [ "bytes", "bytes-utils", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "http-body 0.4.6", "http-body 1.0.1", "http-body-util", @@ -836,7 +836,7 @@ dependencies = [ "axum-core", "bytes", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "hyper", 
@@ -869,7 +869,7 @@ dependencies = [ "async-trait", "bytes", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "mime", @@ -929,9 +929,9 @@ dependencies = [ [[package]] name = "bitflags" -version = "2.12.1" +version = "2.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84d7ced0ae9557296835c32bf1b1e02b44c746701f898460fb000d7eaa84f00a" +checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" [[package]] name = "bitpacking" @@ -2569,7 +2569,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow-array", "rand 0.9.4", @@ -2950,7 +2950,7 @@ dependencies = [ "fnv", "futures-core", "futures-sink", - "http 1.4.1", + "http 1.4.2", "indexmap 2.14.0", "slab", "tokio", @@ -3059,7 +3059,7 @@ checksum = "430b33fa84f92796d4d263070b6c0d3ca219df7b9a0e1853ee431029b1612bcd" dependencies = [ "async-trait", "bytes", - "http 1.4.1", + "http 1.4.2", "more-asserts", "serde", "thiserror 2.0.18", @@ -3115,9 +3115,9 @@ dependencies = [ [[package]] name = "http" -version = "1.4.1" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8be7462df143984c4598a256ef469b251d7d7f9e271135073e78fc535414f3d0" +checksum = "6970f50e31d6fc17d3fa27329444bfa74e196cf62e95052a3f6fee181dba6425" dependencies = [ "bytes", "itoa", @@ -3141,7 +3141,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", - "http 1.4.1", + "http 1.4.2", ] [[package]] @@ -3152,7 +3152,7 @@ checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" dependencies = [ "bytes", "futures-core", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "pin-project-lite", ] @@ -3195,7 +3195,7 @@ dependencies = [ "futures-channel", "futures-core", "h2", - 
"http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "httparse", "httpdate", @@ -3212,7 +3212,7 @@ version = "0.27.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33ca68d021ef39cf6463ab54c1d0f5daf03377b70561305bb89a8f83aab66e0f" dependencies = [ - "http 1.4.1", + "http 1.4.2", "hyper", "hyper-util", "rustls", @@ -3246,7 +3246,7 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "hyper", "ipnet", @@ -3770,7 +3770,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arc-swap", "arrow", @@ -3844,7 +3844,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow-array", "arrow-buffer", @@ -3886,7 +3886,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrayref", "paste", @@ -3895,7 +3895,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow-array", "arrow-buffer", @@ -3910,6 +3910,7 @@ dependencies = [ "itertools 0.13.0", "lance-arrow", "libc", + "libm", "log", "moka", "num_cpus", @@ -3925,12 +3926,13 @@ dependencies = [ "tokio-stream", "tokio-util", "tracing", + "twox-hash", "url", ] [[package]] name = "lance-datafusion" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow", "arrow-array", @@ -3962,7 +3964,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow", "arrow-array", @@ -3980,7 +3982,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow-arith", "arrow-array", @@ -4015,7 +4017,7 @@ dependencies = [ [[package]] name = "lance-file" -version = 
"8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow-arith", "arrow-array", @@ -4046,7 +4048,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "datafusion", "geo-traits", @@ -4060,7 +4062,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arc-swap", "arrow", @@ -4104,7 +4106,6 @@ dependencies = [ "lance-select", "lance-table", "lance-tokenizer", - "libm", "libsais-rs", "log", "ndarray", @@ -4124,13 +4125,12 @@ dependencies = [ "tempfile", "tokio", "tracing", - "twox-hash", "uuid", ] [[package]] name = "lance-io" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow", "arrow-arith", @@ -4149,7 +4149,7 @@ dependencies = [ "chrono", "deepsize", "futures", - "http 1.4.1", + "http 1.4.2", "io-uring", "lance-arrow", "lance-core", @@ -4172,7 +4172,7 @@ dependencies = [ [[package]] name = "lance-jni" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow", "arrow-array", @@ -4208,7 +4208,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow-array", "arrow-buffer", @@ -4224,7 +4224,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow", "async-trait", @@ -4236,7 +4236,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow", "arrow-ipc", @@ -4280,7 +4280,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow-array", "arrow-buffer", @@ -4296,7 +4296,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow", "arrow-array", @@ -4334,7 +4334,7 @@ dependencies = [ [[package]] name = 
"lance-tokenizer" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "icu_segmenter", "rust-stemmers", @@ -4446,9 +4446,9 @@ dependencies = [ [[package]] name = "link-section" -version = "0.18.1" +version = "0.18.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "014e440054ce8170890229eeef5bcda955305e056ec713de40ed366944483f09" +checksum = "c2b1dd6fe32e55c0fc0ea9493aa57459ca3cf4ff3c857c7d0302290150da6e4f" [[package]] name = "linktime-proc-macro" @@ -4849,7 +4849,7 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body-util", "httparse", "humantime", @@ -4943,7 +4943,7 @@ dependencies = [ "base64", "bytes", "futures", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "jiff", "log", @@ -4968,7 +4968,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0d6f81ba6960e3fae1882f253b114b21d7e444e1534f209c7737a79f6243eb6f" dependencies = [ "futures", - "http 1.4.1", + "http 1.4.2", "mea", "opendal-core", ] @@ -5012,7 +5012,7 @@ checksum = "0030644366ef5d8cbe3a4a5822bf99a4aafddc1666e9d24b44d158d9062fc76a" dependencies = [ "base64", "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "opendal-service-azure-common", @@ -5033,7 +5033,7 @@ checksum = "6dea4908d490143a9b0b7f7a790e139ff829b06a023f670455ed3d44f664b361" dependencies = [ "base64", "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "opendal-service-azure-common", @@ -5051,7 +5051,7 @@ version = "0.57.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b489f13c42e69d69bdd72952b634356ec43a7881a20259b38b540fcecdf4051" dependencies = [ - "http 1.4.1", + "http 1.4.2", "opendal-core", ] @@ -5062,7 +5062,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aa8cafe9729213375c7331019b0cb756ad3e1aff7f45cd32c45eae91ebde8901" dependencies = [ "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", 
"quick-xml 0.39.4", @@ -5080,7 +5080,7 @@ checksum = "48de101aac565ed06af4b47903c24eafd249075553ec1fb18256751c45148d47" dependencies = [ "async-trait", "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "percent-encoding", @@ -5115,7 +5115,7 @@ checksum = "c4922661976a1d40794a2adfbdb888cc3c23097690f825a92f773af38908a848" dependencies = [ "bytes", "hf-xet", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "percent-encoding", @@ -5131,7 +5131,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "328fa55e8888cbdfe00826bfea2a79042422b720e8369e9e021e46121dea5ace" dependencies = [ "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "quick-xml 0.39.4", @@ -5150,7 +5150,7 @@ dependencies = [ "base64", "bytes", "crc32c", - "http 1.4.1", + "http 1.4.2", "log", "md-5 0.11.0", "opendal-core", @@ -5169,7 +5169,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f2f7a4c32e5202eb4ac72e76c4b5e30c86ab60762811172f4111103b9d673a1" dependencies = [ "bytes", - "http 1.4.1", + "http 1.4.2", "opendal-core", "quick-xml 0.39.4", "reqsign-core", @@ -5572,9 +5572,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568" +checksum = "528ac67416ff8646872a3c02cad9cc4ee5dc9f9540c9b10771855c95cb2e5ae1" dependencies = [ "bytes", "prost-derive", @@ -5582,9 +5582,9 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" +checksum = "03da047801ff44bb6a4d407d4860c05fd70bb81714e6b2f3812603d5b145b042" dependencies = [ "heck", "itertools 0.14.0", @@ -5601,9 +5601,9 @@ dependencies = [ [[package]] name = "prost-derive" -version = "0.14.3" +version = "0.14.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" +checksum = "b570b25f7617e43d59005d0990ccb79e950a423952cea19671b7a876da390adf" dependencies = [ "anyhow", "itertools 0.14.0", @@ -5614,9 +5614,9 @@ dependencies = [ [[package]] name = "prost-types" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8991c4cbdb8bc5b11f0b074ffe286c30e523de90fee5ba8132f1399f23cb3dd7" +checksum = "f94967dc7688f3054c7fac87473ffae4cc4c3904800e2d9f5b857246d8963b0a" dependencies = [ "prost", ] @@ -5965,7 +5965,7 @@ checksum = "372266b4733756738eeb199a98188037d27a0989980e2600ae7ce1faf00a867d" dependencies = [ "anyhow", "form_urlencoded", - "http 1.4.1", + "http 1.4.2", "log", "percent-encoding", "reqsign-core", @@ -5984,7 +5984,7 @@ dependencies = [ "bytes", "form_urlencoded", "hex", - "http 1.4.1", + "http 1.4.2", "log", "percent-encoding", "quick-xml 0.40.1", @@ -6006,7 +6006,7 @@ dependencies = [ "base64", "bytes", "form_urlencoded", - "http 1.4.1", + "http 1.4.2", "log", "pem", "percent-encoding", @@ -6030,7 +6030,7 @@ dependencies = [ "futures", "hex", "hmac 0.13.0", - "http 1.4.1", + "http 1.4.2", "jiff", "log", "percent-encoding", @@ -6060,7 +6060,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eb215d0876a18b6bd9cdd380b589e5292aaa638ca15266de794b1122d898b6b2" dependencies = [ "form_urlencoded", - "http 1.4.1", + "http 1.4.2", "log", "percent-encoding", "reqsign-aws-v4", @@ -6078,7 +6078,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84110aabba799fbcd48b3abb51fbbff4749f879252e5806b6f5d0cbe0fef6abb" dependencies = [ "anyhow", - "http 1.4.1", + "http 1.4.2", "log", "percent-encoding", "reqsign-core", @@ -6093,7 +6093,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91d083a363b3577f519ce8425bb50f902622a28a83f7c4a26a5c990b66ec75b3" 
dependencies = [ "anyhow", - "http 1.4.1", + "http 1.4.2", "log", "percent-encoding", "reqsign-core", @@ -6111,7 +6111,7 @@ dependencies = [ "futures-core", "futures-util", "h2", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "hyper", @@ -6155,7 +6155,7 @@ dependencies = [ "bytes", "futures-core", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "hyper", @@ -6193,7 +6193,7 @@ checksum = "07bc3f1384cffa4f274dad2d4ddd73aed32fed8f786d96c6be8aa4e5fd3c3b58" dependencies = [ "anyhow", "async-trait", - "http 1.4.1", + "http 1.4.2", "reqwest 0.13.4", "thiserror 2.0.18", "tower-service", @@ -7350,7 +7350,7 @@ dependencies = [ "base64", "bytes", "h2", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "hyper", @@ -7406,7 +7406,7 @@ checksum = "1e9cd434a998747dd2c4276bc96ee2e0c7a2eadf3cae88e52be55a05fa9053f5" dependencies = [ "bitflags", "bytes", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "pin-project-lite", @@ -7426,7 +7426,7 @@ dependencies = [ "bytes", "futures-core", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "pin-project-lite", @@ -8486,7 +8486,7 @@ dependencies = [ "clap", "crc32fast", "futures", - "http 1.4.1", + "http 1.4.2", "hyper", "lazy_static", "more-asserts", @@ -8560,7 +8560,7 @@ dependencies = [ "chrono", "clap", "gearhash", - "http 1.4.1", + "http 1.4.2", "itertools 0.14.0", "lazy_static", "more-asserts", diff --git a/java/lance-jni/Cargo.toml b/java/lance-jni/Cargo.toml index 5eaa69f071b..606ff756676 100644 --- a/java/lance-jni/Cargo.toml +++ b/java/lance-jni/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lance-jni" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" edition = "2024" authors = ["Lance Devs "] rust-version = "1.91" diff --git a/java/pom.xml b/java/pom.xml index c47523939a5..592b45634f6 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -7,7 +7,7 @@ org.lance lance-core Lance Core - 8.0.0-beta.6 + 
8.0.0-beta.7 jar Lance Format Java API diff --git a/python/Cargo.lock b/python/Cargo.lock index 7867ea71446..e2df3d18af5 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -417,7 +417,7 @@ version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f633dbfdf39c039ada1bf9e34c694816eb71fbb7dc78f613993b7245e078a1ed" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "serde_core", "serde_json", ] @@ -582,7 +582,7 @@ dependencies = [ "bytes", "fastrand", "hex", - "http 1.4.1", + "http 1.4.2", "ring", "time", "tokio", @@ -642,7 +642,7 @@ dependencies = [ "bytes", "bytes-utils", "fastrand", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "percent-encoding", "pin-project-lite", @@ -669,7 +669,7 @@ dependencies = [ "bytes", "fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "regex-lite", "tracing", ] @@ -693,7 +693,7 @@ dependencies = [ "bytes", "fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "regex-lite", "tracing", ] @@ -717,7 +717,7 @@ dependencies = [ "bytes", "fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "regex-lite", "tracing", ] @@ -742,7 +742,7 @@ dependencies = [ "aws-types", "fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "regex-lite", "tracing", ] @@ -762,7 +762,7 @@ dependencies = [ "hex", "hmac 0.12.1", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "percent-encoding", "sha2 0.10.9", "time", @@ -792,7 +792,7 @@ dependencies = [ "bytes-utils", "futures-core", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "percent-encoding", @@ -811,7 +811,7 @@ dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "h2", - "http 1.4.1", + "http 1.4.2", "hyper", "hyper-rustls", "hyper-util", @@ -868,7 +868,7 @@ dependencies = [ "bytes", "fastrand", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "http-body 0.4.6", "http-body 1.0.1", "http-body-util", @@ -888,7 +888,7 @@ dependencies = [ "aws-smithy-types", "bytes", "http 0.2.12", - "http 
1.4.1", + "http 1.4.2", "pin-project-lite", "tokio", "tracing", @@ -906,7 +906,7 @@ dependencies = [ "bytes-utils", "futures-core", "http 0.2.12", - "http 1.4.1", + "http 1.4.2", "http-body 0.4.6", "http-body 1.0.1", "http-body-util", @@ -954,7 +954,7 @@ dependencies = [ "axum-core", "bytes", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "hyper", @@ -987,7 +987,7 @@ dependencies = [ "async-trait", "bytes", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "mime", @@ -1053,9 +1053,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.12.1" +version = "2.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84d7ced0ae9557296835c32bf1b1e02b44c746701f898460fb000d7eaa84f00a" +checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" [[package]] name = "bitpacking" @@ -2863,7 +2863,7 @@ version = "25.12.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "rustc_version", ] @@ -2919,7 +2919,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow-array", "rand 0.9.4", @@ -3309,7 +3309,7 @@ dependencies = [ "fnv", "futures-core", "futures-sink", - "http 1.4.1", + "http 1.4.2", "indexmap 2.14.0", "slab", "tokio", @@ -3418,7 +3418,7 @@ checksum = "430b33fa84f92796d4d263070b6c0d3ca219df7b9a0e1853ee431029b1612bcd" dependencies = [ "async-trait", "bytes", - "http 1.4.1", + "http 1.4.2", "more-asserts", "serde", "thiserror 2.0.18", @@ -3474,9 +3474,9 @@ dependencies = [ [[package]] name = "http" -version = "1.4.1" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8be7462df143984c4598a256ef469b251d7d7f9e271135073e78fc535414f3d0" +checksum = "6970f50e31d6fc17d3fa27329444bfa74e196cf62e95052a3f6fee181dba6425" dependencies = [ "bytes", "itoa", @@ -3500,7 +3500,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", - "http 1.4.1", + "http 1.4.2", ] [[package]] @@ -3511,7 +3511,7 @@ checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" dependencies = [ "bytes", "futures-core", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "pin-project-lite", ] @@ -3554,7 +3554,7 @@ dependencies = [ "futures-channel", "futures-core", "h2", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "httparse", "httpdate", @@ -3571,7 +3571,7 @@ version = "0.27.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33ca68d021ef39cf6463ab54c1d0f5daf03377b70561305bb89a8f83aab66e0f" dependencies = [ - "http 1.4.1", + "http 1.4.2", "hyper", "hyper-util", "rustls", @@ -3605,7 +3605,7 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "hyper", "ipnet", @@ -3901,7 +3901,7 @@ version = "0.7.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4d09b98f7eace8982db770e4408e7470b028ce513ac28fecdc6bf4c30fe92b62" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "cfg-if 1.0.4", "libc", ] @@ -4136,7 +4136,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arc-swap", "arrow", @@ -4211,7 +4211,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow-array", "arrow-buffer", @@ -4253,7 +4253,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" 
dependencies = [ "arrayref", "paste", @@ -4262,7 +4262,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow-array", "arrow-buffer", @@ -4299,7 +4299,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow", "arrow-array", @@ -4331,7 +4331,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow", "arrow-array", @@ -4349,7 +4349,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow-arith", "arrow-array", @@ -4384,7 +4384,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow-arith", "arrow-array", @@ -4415,7 +4415,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "datafusion", "geo-traits", @@ -4429,7 +4429,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arc-swap", "arrow", @@ -4498,7 +4498,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow", "arrow-arith", @@ -4517,7 +4517,7 @@ dependencies = [ "chrono", "deepsize", "futures", - "http 1.4.1", + "http 1.4.2", "io-uring", "lance-arrow", "lance-core", @@ -4540,7 +4540,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow-array", "arrow-buffer", @@ -4556,7 +4556,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow", "async-trait", @@ -4568,7 +4568,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.6" +version = 
"8.0.0-beta.7" dependencies = [ "arrow", "arrow-ipc", @@ -4612,7 +4612,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow-array", "arrow-buffer", @@ -4628,7 +4628,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow", "arrow-array", @@ -4668,7 +4668,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "icu_segmenter", "jieba-rs", @@ -4872,9 +4872,9 @@ dependencies = [ [[package]] name = "link-section" -version = "0.18.1" +version = "0.18.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "014e440054ce8170890229eeef5bcda955305e056ec713de40ed366944483f09" +checksum = "c2b1dd6fe32e55c0fc0ea9493aa57459ca3cf4ff3c857c7d0302290150da6e4f" [[package]] name = "linktime-proc-macro" @@ -5268,7 +5268,7 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", ] [[package]] @@ -5313,7 +5313,7 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body-util", "httparse", "humantime", @@ -5407,7 +5407,7 @@ dependencies = [ "base64", "bytes", "futures", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "jiff", "log", @@ -5432,7 +5432,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0d6f81ba6960e3fae1882f253b114b21d7e444e1534f209c7737a79f6243eb6f" dependencies = [ "futures", - "http 1.4.1", + "http 1.4.2", "mea", "opendal-core", ] @@ -5476,7 +5476,7 @@ checksum = "0030644366ef5d8cbe3a4a5822bf99a4aafddc1666e9d24b44d158d9062fc76a" dependencies = [ "base64", "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "opendal-service-azure-common", @@ -5497,7 +5497,7 @@ 
checksum = "6dea4908d490143a9b0b7f7a790e139ff829b06a023f670455ed3d44f664b361" dependencies = [ "base64", "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "opendal-service-azure-common", @@ -5515,7 +5515,7 @@ version = "0.57.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b489f13c42e69d69bdd72952b634356ec43a7881a20259b38b540fcecdf4051" dependencies = [ - "http 1.4.1", + "http 1.4.2", "opendal-core", ] @@ -5526,7 +5526,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aa8cafe9729213375c7331019b0cb756ad3e1aff7f45cd32c45eae91ebde8901" dependencies = [ "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "quick-xml 0.39.4", @@ -5544,7 +5544,7 @@ checksum = "48de101aac565ed06af4b47903c24eafd249075553ec1fb18256751c45148d47" dependencies = [ "async-trait", "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "percent-encoding", @@ -5579,7 +5579,7 @@ checksum = "c4922661976a1d40794a2adfbdb888cc3c23097690f825a92f773af38908a848" dependencies = [ "bytes", "hf-xet", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "percent-encoding", @@ -5595,7 +5595,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "328fa55e8888cbdfe00826bfea2a79042422b720e8369e9e021e46121dea5ace" dependencies = [ "bytes", - "http 1.4.1", + "http 1.4.2", "log", "opendal-core", "quick-xml 0.39.4", @@ -5614,7 +5614,7 @@ dependencies = [ "base64", "bytes", "crc32c", - "http 1.4.1", + "http 1.4.2", "log", "md-5 0.11.0", "opendal-core", @@ -5633,7 +5633,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f2f7a4c32e5202eb4ac72e76c4b5e30c86ab60762811172f4111103b9d673a1" dependencies = [ "bytes", - "http 1.4.1", + "http 1.4.2", "opendal-core", "quick-xml 0.39.4", "reqsign-core", @@ -6075,9 +6075,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568" +checksum = "528ac67416ff8646872a3c02cad9cc4ee5dc9f9540c9b10771855c95cb2e5ae1" dependencies = [ "bytes", "prost-derive", @@ -6085,9 +6085,9 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" +checksum = "03da047801ff44bb6a4d407d4860c05fd70bb81714e6b2f3812603d5b145b042" dependencies = [ "heck", "itertools 0.14.0", @@ -6104,9 +6104,9 @@ dependencies = [ [[package]] name = "prost-derive" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" +checksum = "b570b25f7617e43d59005d0990ccb79e950a423952cea19671b7a876da390adf" dependencies = [ "anyhow", "itertools 0.14.0", @@ -6117,9 +6117,9 @@ dependencies = [ [[package]] name = "prost-types" -version = "0.14.3" +version = "0.14.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8991c4cbdb8bc5b11f0b074ffe286c30e523de90fee5ba8132f1399f23cb3dd7" +checksum = "f94967dc7688f3054c7fac87473ffae4cc4c3904800e2d9f5b857246d8963b0a" dependencies = [ "prost", ] @@ -6156,7 +6156,7 @@ dependencies = [ [[package]] name = "pylance" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" dependencies = [ "arrow", "arrow-array", @@ -6558,7 +6558,7 @@ version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", ] [[package]] @@ -6663,7 +6663,7 @@ checksum = "372266b4733756738eeb199a98188037d27a0989980e2600ae7ce1faf00a867d" dependencies = [ "anyhow", "form_urlencoded", - "http 1.4.1", + "http 1.4.2", "log", "percent-encoding", "reqsign-core", @@ -6682,7 +6682,7 @@ dependencies 
= [ "bytes", "form_urlencoded", "hex", - "http 1.4.1", + "http 1.4.2", "log", "percent-encoding", "quick-xml 0.40.1", @@ -6704,7 +6704,7 @@ dependencies = [ "base64", "bytes", "form_urlencoded", - "http 1.4.1", + "http 1.4.2", "log", "pem", "percent-encoding", @@ -6728,7 +6728,7 @@ dependencies = [ "futures", "hex", "hmac 0.13.0", - "http 1.4.1", + "http 1.4.2", "jiff", "log", "percent-encoding", @@ -6758,7 +6758,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eb215d0876a18b6bd9cdd380b589e5292aaa638ca15266de794b1122d898b6b2" dependencies = [ "form_urlencoded", - "http 1.4.1", + "http 1.4.2", "log", "percent-encoding", "reqsign-aws-v4", @@ -6776,7 +6776,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84110aabba799fbcd48b3abb51fbbff4749f879252e5806b6f5d0cbe0fef6abb" dependencies = [ "anyhow", - "http 1.4.1", + "http 1.4.2", "log", "percent-encoding", "reqsign-core", @@ -6791,7 +6791,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91d083a363b3577f519ce8425bb50f902622a28a83f7c4a26a5c990b66ec75b3" dependencies = [ "anyhow", - "http 1.4.1", + "http 1.4.2", "log", "percent-encoding", "reqsign-core", @@ -6809,7 +6809,7 @@ dependencies = [ "futures-core", "futures-util", "h2", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "hyper", @@ -6853,7 +6853,7 @@ dependencies = [ "bytes", "futures-core", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "hyper", @@ -6891,7 +6891,7 @@ checksum = "07bc3f1384cffa4f274dad2d4ddd73aed32fed8f786d96c6be8aa4e5fd3c3b58" dependencies = [ "anyhow", "async-trait", - "http 1.4.1", + "http 1.4.2", "reqwest 0.13.4", "thiserror 2.0.18", "tower-service", @@ -7030,7 +7030,7 @@ version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", 
"errno", "libc", "linux-raw-sys", @@ -7235,7 +7235,7 @@ version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "core-foundation 0.10.1", "core-foundation-sys", "libc", @@ -7837,7 +7837,7 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a13f3d0daba03132c0aa9767f98351b3488edc2c100cda2d2ec2b04f3d8d3c8b" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "core-foundation 0.9.4", "system-configuration-sys", ] @@ -8132,7 +8132,7 @@ dependencies = [ "base64", "bytes", "h2", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "hyper", @@ -8186,9 +8186,9 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e9cd434a998747dd2c4276bc96ee2e0c7a2eadf3cae88e52be55a05fa9053f5" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "bytes", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "pin-project-lite", @@ -8204,11 +8204,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4cfcf7e2740e6fc6d4d688b4ef00650406bb94adf4731e43c096c3a19fe40840" dependencies = [ "async-compression", - "bitflags 2.12.1", + "bitflags 2.13.0", "bytes", "futures-core", "futures-util", - "http 1.4.1", + "http 1.4.2", "http-body 1.0.1", "http-body-util", "pin-project-lite", @@ -8719,7 +8719,7 @@ version = "0.244.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" dependencies = [ - "bitflags 2.12.1", + "bitflags 2.13.0", "hashbrown 0.15.5", "indexmap 2.14.0", "semver", @@ -9157,7 +9157,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" dependencies = [ "anyhow", - "bitflags 2.12.1", + 
"bitflags 2.13.0", "indexmap 2.14.0", "log", "serde", @@ -9240,7 +9240,7 @@ dependencies = [ "clap", "crc32fast", "futures", - "http 1.4.1", + "http 1.4.2", "hyper", "lazy_static", "more-asserts", @@ -9314,7 +9314,7 @@ dependencies = [ "chrono", "clap", "gearhash", - "http 1.4.1", + "http 1.4.2", "itertools 0.14.0", "lazy_static", "more-asserts", diff --git a/python/Cargo.toml b/python/Cargo.toml index 9c7800d3c83..7266a699d0c 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pylance" -version = "8.0.0-beta.6" +version = "8.0.0-beta.7" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" From 53f1ac675feee9bc961db55d91fb594342ecc64a Mon Sep 17 00:00:00 2001 From: Brendan Clement Date: Mon, 8 Jun 2026 09:23:06 -0700 Subject: [PATCH 054/177] refactor(io/exec): address #6799 review (drop helper, inline RAII timer) (#6907) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Addresses Weston's review comments on #6799 - Revert: drop setup-time Instant::now() brackets that captured I/O time. I wrapped setup `.await`s in `MapIndexExec`, `FlatMatchFilterExec`, and `FlatMatchQueryExec` with `Instant::now() to add_duration(start.elapsed())` to attribute one-shot setup work to `elapsed_compute`. Weston pointed out those brackets capture I/O wait, not CPU. - Replace InstrumentedChildInputStream with inline RAII timer in .then() bodies, Deletes the `InstrumentedChildInputStream` helper struct. - Extend flat_bm25 scoring timer to pre- and post-await sync work In flat_bm25_search_stream_with_metrics the single scoring_start region only covered initialize_scorer + flat_bm25_score. ### Testing Benchmarked (1M-row synthetic dataset with BTREE + INVERTED + IVF_PQ indexes, identical analyze plan queries on each side). 
Comparing against the commit prior to this work | Scenario | Node | Before | After | Notes | |---|---|---|---|---| | KNN + BTREE scalar prefilter | KNNVectorDistance (520 rows) | 1.39 ms | 112 µs | ~12× smaller — the original double-count fix preserved | | KNN + FTS post-filter | FlatMatchFilter (0 rows) | 10.95 ms | 42 µs | ~262× smaller | | KNN + FTS prefilter | FlatMatchQuery (2.52 K rows) | 848 µs | 79.08 ms | ~93× larger; spawn_cpu tokenize CPU now correctly attributed | | KNN + FTS prefilter (control) | MatchQuery (untouched code) | 3.80 ms | 4.63 ms | noise only | --------- Co-authored-by: Xuanwo --- rust/lance-index/src/scalar/inverted/index.rs | 16 +- rust/lance/src/io/exec/fts.rs | 93 +++---- rust/lance/src/io/exec/knn.rs | 51 ++-- rust/lance/src/io/exec/rowids.rs | 64 +++-- rust/lance/src/io/exec/scalar_index.rs | 57 ++-- rust/lance/src/io/exec/utils.rs | 260 +----------------- 6 files changed, 144 insertions(+), 397 deletions(-) diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index 1ce17f53244..e7ebfec1e82 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -4955,6 +4955,9 @@ pub async fn flat_bm25_search_stream_with_metrics( elapsed_compute: Option

This setting currently only affects RQ-quantized vector indexes, such as IVF_RQ. Other index + * types ignore this setting. + */ +public enum ApproxMode { + /** Prefer faster approximate scoring when supported by the RQ index. */ + FAST("fast"), + + /** Use the index's default approximation behavior. */ + NORMAL("normal"), + + /** Prefer more accurate approximate scoring when supported by the RQ index. */ + ACCURATE("accurate"); + + private final String value; + + ApproxMode(String value) { + this.value = value; + } + + /** Returns the lowercase value passed across the JNI boundary. */ + public String toRustString() { + return value; + } +} diff --git a/java/src/main/java/org/lance/ipc/Query.java b/java/src/main/java/org/lance/ipc/Query.java index 48013b375ee..215865310df 100644 --- a/java/src/main/java/org/lance/ipc/Query.java +++ b/java/src/main/java/org/lance/ipc/Query.java @@ -32,6 +32,7 @@ public class Query { private final Optional distanceType; private final boolean useIndex; private final int queryParallelism; + private final ApproxMode approxMode; private Query(Builder builder) { this.column = Preconditions.checkNotNull(builder.column, "Columns must be set"); @@ -52,6 +53,7 @@ private Query(Builder builder) { this.distanceType = builder.distanceType; this.useIndex = builder.useIndex; this.queryParallelism = builder.queryParallelism; + this.approxMode = builder.approxMode; } public String getColumn() { @@ -98,6 +100,14 @@ public int getQueryParallelism() { return queryParallelism; } + public ApproxMode getApproxMode() { + return approxMode; + } + + public String getApproxModeString() { + return approxMode.toRustString(); + } + @Override public String toString() { return MoreObjects.toStringHelper(this) @@ -111,6 +121,7 @@ public String toString() { .add("distanceType", distanceType.orElse(null)) .add("useIndex", useIndex) .add("queryParallelism", queryParallelism) + .add("approxMode", approxMode) .toString(); } @@ -125,6 +136,7 @@ public static class 
Builder { private Optional distanceType = Optional.empty(); private boolean useIndex = true; private int queryParallelism = 0; + private ApproxMode approxMode = ApproxMode.NORMAL; /** * Sets the column to be searched. @@ -275,6 +287,20 @@ public Builder setQueryParallelism(int queryParallelism) { return this; } + /** + * Sets the speed / accuracy tradeoff for approximate vector search. + * + *

This setting currently only affects RQ-quantized vector indexes, such as IVF_RQ. Other + * index types ignore this setting. + * + * @param approxMode The approximate search mode to use for the query. + * @return The Builder instance for method chaining. + */ + public Builder setApproxMode(ApproxMode approxMode) { + this.approxMode = Preconditions.checkNotNull(approxMode, "ApproxMode must not be null"); + return this; + } + /** * Builds the Query object. * diff --git a/java/src/test/java/org/lance/JNITest.java b/java/src/test/java/org/lance/JNITest.java index c0e5f900edc..daa123b3200 100644 --- a/java/src/test/java/org/lance/JNITest.java +++ b/java/src/test/java/org/lance/JNITest.java @@ -20,6 +20,7 @@ import org.lance.index.vector.PQBuildParams; import org.lance.index.vector.SQBuildParams; import org.lance.index.vector.VectorIndexParams; +import org.lance.ipc.ApproxMode; import org.lance.ipc.Query; import org.lance.test.JniTestHelper; @@ -28,6 +29,7 @@ import java.util.Arrays; import java.util.Optional; +import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; public class JNITest { @@ -48,6 +50,10 @@ public void testIntsOpt() { @Test public void testQuery() { + Query defaultQuery = + new Query.Builder().setColumn("column").setKey(new float[] {1.0f, 2.0f, 3.0f}).build(); + assertEquals(ApproxMode.NORMAL, defaultQuery.getApproxMode()); + JniTestHelper.parseQuery( Optional.of( new Query.Builder() @@ -60,6 +66,7 @@ public void testQuery() { .setDistanceType(DistanceType.L2) .setUseIndex(true) .setQueryParallelism(-1) + .setApproxMode(ApproxMode.ACCURATE) .build())); } diff --git a/protos/ann.proto b/protos/ann.proto index c9d3b4dcc2f..f5de5e25e7b 100644 --- a/protos/ann.proto +++ b/protos/ann.proto @@ -9,6 +9,20 @@ import "table_identifier.proto"; import "table.proto"; import "index.proto"; +// Query-time approximation mode for vector search. 
+// +// This currently only affects RQ-quantized vector indexes, such as IVF_RQ. +// Other index types ignore this setting. +enum VectorApproxMode { + // Use all RQ bits for query-time scoring with u8-quantized lookup tables. + Normal = 0; + // Use only one RQ bit for query-time scoring, even for multi-bit indexes. + Fast = 1; + // Use all RQ bits for query-time scoring with u16-quantized lookup tables + // to reduce estimator quantization error. + Accurate = 2; +} + // Serialized vector query parameters. message VectorQueryProto { // Query vector as Arrow IPC bytes (supports Float16, Float32, Float64, UInt8, etc.) @@ -26,6 +40,9 @@ message VectorQueryProto { bool use_index = 11; optional float dist_q_c = 12; optional int32 query_parallelism = 13; + // Query-time approximation mode. Currently only affects RQ-quantized vector + // indexes, such as IVF_RQ. Other index types ignore this setting. + VectorApproxMode approx_mode = 14; } // Serializable form of ANNIvfSubIndexExec — the IVF sub-index search node. diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 703b71ae8b4..831e194f9f3 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -6252,6 +6252,7 @@ def nearest( use_index: bool = True, ef: Optional[int] = None, query_parallelism: Optional[int] = None, + approx_mode: Literal["fast", "normal", "accurate"] = "normal", distance_range: Optional[tuple[Optional[float], Optional[float]]] = None, ) -> ScannerBuilder: """Configure nearest neighbor search. @@ -6275,6 +6276,13 @@ def nearest( the CPU pool size. Value 1 uses the single-worker sequential path. Values >= 2 use the partition-parallel path and are clamped to the CPU pool size. + approx_mode: {"fast", "normal", "accurate"}, default "normal" + Controls the speed / accuracy tradeoff for approximate vector search + when supported by the selected index. This currently only affects + RQ-quantized indexes, such as IVF_RQ. 
Other index types ignore this + setting. ``fast`` favors lower latency and may reduce recall, + ``normal`` uses the default balance, and ``accurate`` favors higher + recall and may increase latency. """ self._nearest = _build_vector_search_query( column, @@ -6289,6 +6297,7 @@ def nearest( use_index=use_index, ef=ef, query_parallelism=query_parallelism, + approx_mode=approx_mode, distance_range=distance_range, ) return self @@ -7411,6 +7420,7 @@ def _build_vector_search_query( use_index: bool = True, ef: Optional[int] = None, query_parallelism: Optional[int] = None, + approx_mode: Literal["fast", "normal", "accurate"] = "normal", distance_range: Optional[tuple[Optional[float], Optional[float]]] = None, ) -> dict: """Configure nearest neighbor search. @@ -7452,6 +7462,13 @@ def _build_vector_search_query( maps to the single-worker sequential path. Value -1 uses the CPU pool size. Value 1 uses the single-worker sequential path. Values >= 2 use the partition-parallel path and are clamped to the CPU pool size. + approx_mode: {"fast", "normal", "accurate"}, default "normal" + Controls the speed / accuracy tradeoff for approximate vector search + when supported by the selected index. This currently only affects + RQ-quantized indexes, such as IVF_RQ. Other index types ignore this + setting. ``fast`` favors lower latency and may reduce recall, + ``normal`` uses the default balance, and ``accurate`` favors higher + recall and may increase latency. distance_range: tuple[Optional[float], Optional[float]], optional A tuple of (lower_bound, upper_bound) to filter results by distance. Both bounds are optional. 
The lower bound is inclusive and the upper @@ -7525,6 +7542,12 @@ def _build_vector_search_query( if query_parallelism is not None and query_parallelism < -1: raise ValueError("query_parallelism must be >= -1") + if approx_mode not in {"fast", "normal", "accurate"}: + raise ValueError( + "approx_mode must be one of 'fast', 'normal', or 'accurate', " + f"got {approx_mode!r}" + ) + if distance_range is not None: if len(distance_range) != 2: raise ValueError( @@ -7542,6 +7565,7 @@ def _build_vector_search_query( "use_index": use_index, "ef": ef, "query_parallelism": query_parallelism, + "approx_mode": approx_mode, "distance_range": distance_range, } @@ -7698,6 +7722,7 @@ def __init__( use_index: bool = True, ef: Optional[int] = None, query_parallelism: Optional[int] = None, + approx_mode: Literal["fast", "normal", "accurate"] = "normal", ): self._inner = _build_vector_search_query( column, @@ -7711,6 +7736,7 @@ def __init__( use_index=use_index, ef=ef, query_parallelism=query_parallelism, + approx_mode=approx_mode, ) def inner(self): diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 2e4a3dd4648..292b8079706 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -1101,6 +1101,17 @@ def test_create_ivf_rq_multi_bit_searches_l2_and_cosine(): stats = ds.stats.index_stats("vector_idx") assert stats["indices"][0]["sub_index"]["num_bits"] == 9 assert stats["indices"][0]["sub_index"]["query_estimator"] == "raw_query" + for approx_mode in ["fast", "normal", "accurate"]: + result = ds.to_table( + nearest={ + "column": "vector", + "q": mat[0], + "k": 10, + "approx_mode": approx_mode, + }, + columns=["id"], + ) + assert result.num_rows == 10 cosine_ds = lance.write_dataset(tbl, "memory://") cosine_ds = _assert_recall_at_least(cosine_ds, mat[1], metric="cosine") @@ -2097,6 +2108,33 @@ def test_vector_index_invalid_query_parallelism(indexed_dataset): ) +def 
test_vector_index_with_approx_mode(indexed_dataset): + q = np.random.randn(128) + + for approx_mode in ["fast", "normal", "accurate"]: + result = indexed_dataset.to_table( + nearest={ + "column": "vector", + "q": q, + "k": 10, + "approx_mode": approx_mode, + } + ) + assert len(result) == 10 + + +def test_vector_index_invalid_approx_mode(indexed_dataset): + with pytest.raises(ValueError, match="approx_mode"): + indexed_dataset.scanner( + nearest={ + "column": "vector", + "q": np.random.randn(128), + "k": 10, + "approx_mode": "hacc", + } + ) + + def test_knn_deleted_rows(tmp_path): data = create_table() ds = lance.write_dataset(data, tmp_path) diff --git a/python/src/dataset.rs b/python/src/dataset.rs index f57f36d9202..25e73a9412e 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -78,8 +78,9 @@ use lance_index::{ progress::{IndexBuildProgress, NoopIndexBuildProgress}, scalar::{FullTextSearchQuery, InvertedIndexParams, ScalarIndexParams}, vector::{ - DEFAULT_QUERY_PARALLELISM, Query as VectorQuery, hnsw::builder::HnswBuildParams, - ivf::IvfBuildParams, pq::PQBuildParams, sq::builder::SQBuildParams, + ApproxMode, DEFAULT_QUERY_PARALLELISM, Query as VectorQuery, + hnsw::builder::HnswBuildParams, ivf::IvfBuildParams, pq::PQBuildParams, + sq::builder::SQBuildParams, }, }; use lance_io::object_store::{ @@ -1227,6 +1228,7 @@ impl Dataset { use_index, ef, query_parallelism, + approx_mode, ) = vector_query_params_from_dict(nearest, default_k)?; let (_, element_type) = get_vector_type(self_.ds.schema(), &column) @@ -1293,6 +1295,7 @@ impl Dataset { s = s.ef(ef); } s = s.query_parallelism(query_parallelism); + s = s.approx_mode(approx_mode); s.use_index(use_index); if let Some((lower, upper)) = distance_range { s.distance_range(lower, upper); @@ -4706,6 +4709,7 @@ type VectorQueryParams = ( bool, Option, i32, + ApproxMode, ); fn extract_query_parallelism(value: &Bound<'_, PyAny>) -> PyResult { @@ -4727,6 +4731,23 @@ fn 
vector_query_query_parallelism_from_dict(dict: &Bound<'_, PyDict>) -> PyResul } } +fn vector_query_approx_mode_from_dict(dict: &Bound<'_, PyDict>) -> PyResult { + if let Some(approx_mode) = dict.get_item("approx_mode")? + && !approx_mode.is_none() + { + match approx_mode.to_string().to_lowercase().as_str() { + "fast" => Ok(ApproxMode::Fast), + "normal" => Ok(ApproxMode::Normal), + "accurate" => Ok(ApproxMode::Accurate), + value => Err(PyValueError::new_err(format!( + "approx_mode must be one of 'fast', 'normal', or 'accurate', got '{value}'" + ))), + } + } else { + Ok(ApproxMode::Normal) + } +} + fn vector_query_params_from_dict( dict: &Bound<'_, PyDict>, default_k: usize, @@ -4833,6 +4854,7 @@ fn vector_query_params_from_dict( }; let query_parallelism = vector_query_query_parallelism_from_dict(dict)?; + let approx_mode = vector_query_approx_mode_from_dict(dict)?; Ok(( column, @@ -4845,6 +4867,7 @@ fn vector_query_params_from_dict( use_index, ef, query_parallelism, + approx_mode, )) } @@ -4881,6 +4904,7 @@ impl PySearchFilter { use_index, ef, query_parallelism, + approx_mode, ) = vector_query_params_from_dict(query, default_k)?; let metric_type = Some(metric_type_opt.unwrap_or(MetricType::L2)); @@ -4899,6 +4923,7 @@ impl PySearchFilter { use_index, query_parallelism, dist_q_c: 0.0, + approx_mode, }; Ok(Self { diff --git a/rust/lance-index/src/vector.rs b/rust/lance-index/src/vector.rs index 99f3acf1cb9..d0df2fcb7e2 100644 --- a/rust/lance-index/src/vector.rs +++ b/rust/lance-index/src/vector.rs @@ -82,6 +82,23 @@ pub static CENTROID_DIST_FIELD: LazyLock = LazyLock::new(|| pub const DEFAULT_QUERY_PARALLELISM: i32 = 0; +/// Controls the speed / accuracy tradeoff for approximate vector search. +/// +/// This currently only affects RQ-quantized vector indexes, such as IVF_RQ. +/// Other index types ignore this setting. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum ApproxMode { + /// Prefer lower query latency, which can reduce recall. 
+ Fast, + + /// Use the default balance between query latency and recall. + #[default] + Normal, + + /// Prefer higher recall, which can increase query latency. + Accurate, +} + /// Query parameters for the vector indices #[derive(Debug, Clone)] @@ -141,6 +158,12 @@ pub struct Query { /// the distance between the query and the centroid /// this is only used for IVF index with Rabit quantization pub dist_q_c: f32, + + /// Controls the speed / accuracy tradeoff for approximate vector search. + /// + /// This currently only affects RQ-quantized vector indexes, such as IVF_RQ. + /// Other index types ignore this setting. + pub approx_mode: ApproxMode, } impl From for DistanceType { diff --git a/rust/lance-index/src/vector/bq/storage.rs b/rust/lance-index/src/vector/bq/storage.rs index d8ef06ce92f..bd70f176c5d 100644 --- a/rust/lance-index/src/vector/bq/storage.rs +++ b/rust/lance-index/src/vector/bq/storage.rs @@ -40,6 +40,7 @@ use serde::{Deserialize, Serialize}; use crate::frag_reuse::FragReuseIndex; use crate::pb; +use crate::vector::ApproxMode; use crate::vector::bq::rotation::{apply_fast_rotation, apply_fast_rotation_in_place}; use crate::vector::bq::transform::{ ADD_FACTORS_COLUMN, ERROR_FACTORS_COLUMN, EX_ADD_FACTORS_COLUMN, EX_SCALE_FACTORS_COLUMN, @@ -52,7 +53,9 @@ use crate::vector::bq::{ use crate::vector::graph::{OrderedFloat, OrderedNode}; use crate::vector::pq::storage::transpose; use crate::vector::quantizer::{QuantizerMetadata, QuantizerStorage}; -use crate::vector::storage::{DistCalculator, QueryResidual, RabitRawQueryContext, VectorStore}; +use crate::vector::storage::{ + DistCalculator, DistanceCalculatorOptions, QueryResidual, RabitRawQueryContext, VectorStore, +}; pub const RABIT_METADATA_KEY: &str = "lance:rabit"; pub const RABIT_CODE_COLUMN: &str = "_rabit_codes"; @@ -552,13 +555,17 @@ impl RabitQuantizationStorage { fn distance_calculator_from_parts<'a>( &'a self, - dim: usize, - dist_table: Cow<'a, [f32]>, - ex_dist_table: Cow<'a, [f32]>, - 
sum_q: f32, - query_factor: f32, - query_error: f32, + parts: RabitDistCalculatorParts<'a>, ) -> RabitDistCalculator<'a> { + let RabitDistCalculatorParts { + dim, + dist_table, + ex_dist_table, + sum_q, + query_factor, + query_error, + approx_mode, + } = parts; let ex_codes = self .ex_codes .as_ref() @@ -590,6 +597,7 @@ impl RabitQuantizationStorage { packed_ex_codes, query_factor, query_error, + approx_mode, ) } @@ -756,6 +764,16 @@ fn copy_subtract_f32(lhs: &[f32], rhs: &[f32], output: &mut [f32]) { } } +struct RabitDistCalculatorParts<'a> { + dim: usize, + dist_table: Cow<'a, [f32]>, + ex_dist_table: Cow<'a, [f32]>, + sum_q: f32, + query_factor: f32, + query_error: f32, + approx_mode: ApproxMode, +} + pub struct RabitDistCalculator<'a> { dim: usize, num_bits: u8, @@ -776,6 +794,7 @@ pub struct RabitDistCalculator<'a> { packed_ex_codes: Option<&'a [u8]>, query_factor: f32, query_error: f32, + approx_mode: ApproxMode, sum_q: f32, sqrt_d: f32, @@ -800,6 +819,7 @@ impl<'a> RabitDistCalculator<'a> { packed_ex_codes: Option<&'a [u8]>, query_factor: f32, query_error: f32, + approx_mode: ApproxMode, ) -> Self { Self { dim, @@ -817,6 +837,7 @@ impl<'a> RabitDistCalculator<'a> { packed_ex_codes, query_factor, query_error, + approx_mode, sqrt_d: (dim as f32 * num_bits as f32).sqrt(), sum_q, } @@ -830,7 +851,19 @@ impl<'a> RabitDistCalculator<'a> { dists: &mut Vec, quantized_dists: &mut Vec, quantized_dists_table: &mut Vec, + hacc_quantized_dists: &mut Vec, ) -> usize { + if self.approx_mode == ApproxMode::Accurate { + return self.binary_distances_hacc_with_scratch( + n, + code_len, + dists, + quantized_dists, + quantized_dists_table, + hacc_quantized_dists, + ); + } + let (qmin, qmax) = quantize_dist_table_into(&self.dist_table, quantized_dists_table); let remainder = n % BATCH_SIZE; let simd_len = n - remainder; @@ -881,48 +914,40 @@ impl<'a> RabitDistCalculator<'a> { simd_len } - #[inline] - fn binary_distance_factor_params(&self) -> (f32, f32) { - match 
self.query_estimator { - RabitQueryEstimator::ResidualQuery => (2.0 / self.sqrt_d, -self.sum_q / self.sqrt_d), - RabitQueryEstimator::RawQuery => (1.0, -0.5 * self.sum_q), - } - } - #[allow(clippy::uninit_vec)] - fn one_bit_distances_with_scratch( + fn binary_distances_hacc_with_scratch( &self, n: usize, code_len: usize, dists: &mut Vec, - quantized_dists: &mut Vec, - quantized_dists_table: &mut Vec, - ) { - let (qmin, qmax) = quantize_dist_table_into(&self.dist_table, quantized_dists_table); + quantized_dist_table: &mut Vec, + hacc_dist_table: &mut Vec, + quantized_dists: &mut Vec, + ) -> usize { + let (qmin, qmax) = quantize_dist_table_u16_into(&self.dist_table, quantized_dist_table); + simd::dist_table::transfer_4bit_dist_table_u16(quantized_dist_table, hacc_dist_table); let remainder = n % BATCH_SIZE; let simd_len = n - remainder; quantized_dists.clear(); quantized_dists.reserve(simd_len); - // SAFETY: sum_4bit_dist_table overwrites each element in the SIMD batch range. + // SAFETY: sum_4bit_hacc_dist_table overwrites each element in the batch range. unsafe { quantized_dists.set_len(simd_len); } - simd::dist_table::sum_4bit_dist_table( + simd::dist_table::sum_4bit_hacc_dist_table( simd_len, code_len, self.codes, - quantized_dists_table, + hacc_dist_table, quantized_dists, ); - let range = (qmax - qmin) / 255.0; - let num_tables = quantized_dists_table.len() / SEGMENT_NUM_CODES; + let range = (qmax - qmin) / u16::MAX as f32; + let num_tables = quantized_dist_table.len() / SEGMENT_NUM_CODES; let sum_min = num_tables as f32 * qmin; - let (binary_distance_multiplier, binary_distance_offset) = - self.binary_distance_factor_params(); dists.clear(); dists.reserve(n); - // SAFETY: the SIMD section below writes [0, simd_len), and the + // SAFETY: the batch section writes [0, simd_len), and the // remainder section writes [simd_len, n). 
unsafe { dists.set_len(n); @@ -931,27 +956,60 @@ impl<'a> RabitDistCalculator<'a> { simd_dists .iter_mut() .zip(quantized_dists.iter()) - .enumerate() - .for_each(|(id, (dist, q_dist))| { - let binary_dist = (*q_dist as f32) * range + sum_min; - *dist = (binary_dist * binary_distance_multiplier + binary_distance_offset) - * self.scale_factors[id] - + self.add_factors[id] - + self.query_factor; + .for_each(|(dist, q_dist)| { + *dist = (*q_dist as f32) * range + sum_min; }); remainder_dists .iter_mut() .enumerate() - .for_each(|(offset, dist)| { - let id = simd_len + offset; - let binary_dist = - compute_single_rq_distance(self.codes, id, n, code_len, &self.dist_table); - *dist = (binary_dist * binary_distance_multiplier + binary_distance_offset) - * self.scale_factors[id] - + self.add_factors[id] - + self.query_factor; + .for_each(|(id, dist)| { + *dist = compute_single_rq_distance( + self.codes, + simd_len + id, + n, + code_len, + &self.dist_table, + ); }); + simd_len + } + + #[inline] + fn binary_distance_factor_params(&self) -> (f32, f32) { + match self.query_estimator { + RabitQueryEstimator::ResidualQuery => (2.0 / self.sqrt_d, -self.sum_q / self.sqrt_d), + RabitQueryEstimator::RawQuery => (1.0, -0.5 * self.sum_q), + } + } + + #[allow(clippy::uninit_vec)] + fn one_bit_distances_with_scratch( + &self, + n: usize, + code_len: usize, + dists: &mut Vec, + quantized_dists: &mut Vec, + quantized_dists_table: &mut Vec, + hacc_quantized_dists: &mut Vec, + ) { + self.binary_distances_with_scratch( + n, + code_len, + dists, + quantized_dists, + quantized_dists_table, + hacc_quantized_dists, + ); + let (binary_distance_multiplier, binary_distance_offset) = + self.binary_distance_factor_params(); + dists.iter_mut().enumerate().for_each(|(id, dist)| { + let binary_dist = *dist; + *dist = (binary_dist * binary_distance_multiplier + binary_distance_offset) + * self.scale_factors[id] + + self.add_factors[id] + + self.query_factor; + }); } #[allow(clippy::uninit_vec)] @@ 
-1092,12 +1150,14 @@ impl<'a> RabitDistCalculator<'a> { dists: &mut Vec, quantized_dists: &mut Vec, quantized_dists_table: &mut Vec, + hacc_quantized_dists: &mut Vec, ) { let code_len = rabit_binary_code_bytes(self.dim); let n = self.codes.len() / code_len; if n == 0 { dists.clear(); quantized_dists.clear(); + hacc_quantized_dists.clear(); return; } @@ -1107,6 +1167,7 @@ impl<'a> RabitDistCalculator<'a> { dists, quantized_dists, quantized_dists_table, + hacc_quantized_dists, ); let ex_bits = self.num_bits - 1; @@ -1183,7 +1244,9 @@ impl<'a> RabitDistCalculator<'a> { } fn raw_query_lower_bound_gating_disabled_reason(&self) -> Option<&'static str> { - if self.query_estimator != RabitQueryEstimator::RawQuery { + if self.approx_mode == ApproxMode::Fast { + Some("approx_mode_fast") + } else if self.query_estimator != RabitQueryEstimator::RawQuery { Some("residual_query_estimator") } else if self.num_bits <= 1 { Some("num_bits_le_one") @@ -1306,6 +1369,38 @@ fn quantize_dist_table_into(dist_table: &[f32], quantized_dist_table: &mut Vec, +) -> (f32, f32) { + let (qmin, qmax) = dist_table + .iter() + .cloned() + .minmax_by(|a, b| a.total_cmp(b)) + .into_option() + .unwrap(); + if qmin == qmax { + quantized_dist_table.clear(); + quantized_dist_table.resize(dist_table.len(), 0); + return (qmin, qmax); + } + + let factor = u16::MAX as f32 / (qmax - qmin); + quantized_dist_table.clear(); + quantized_dist_table.reserve(dist_table.len()); + let spare = quantized_dist_table.spare_capacity_mut(); + for (quantized, &d) in spare[..dist_table.len()].iter_mut().zip(dist_table.iter()) { + quantized.write(((d - qmin) * factor).round() as u16); + } + // SAFETY: every element in the reserved range was initialized in the loop above. 
+ unsafe { + quantized_dist_table.set_len(dist_table.len()); + } + + (qmin, qmax) +} + #[inline] fn packed_ex_code_value(row_codes: &[u8], dim_idx: usize, ex_bits: u8) -> u8 { debug_assert!(ex_bits > 0); @@ -1472,11 +1567,8 @@ impl DistCalculator for RabitDistCalculator<'_> { } RabitQueryEstimator::RawQuery => { let ex_bits = self.num_bits - 1; - if ex_bits == 0 { - let binary_dot = dist - 0.5 * self.sum_q; - return binary_dot * self.scale_factors[id] - + self.add_factors[id] - + self.query_factor; + if ex_bits == 0 || self.approx_mode == ApproxMode::Fast { + return self.raw_query_binary_distance(id, dist); } let ex_codes = self @@ -1508,11 +1600,13 @@ impl DistCalculator for RabitDistCalculator<'_> { let mut dists = Vec::new(); let mut quantized_dists = Vec::new(); let mut quantized_dists_table = Vec::new(); + let mut hacc_quantized_dists = Vec::new(); self.distance_all_with_scratch( 0, &mut dists, &mut quantized_dists, &mut quantized_dists_table, + &mut hacc_quantized_dists, ); dists } @@ -1525,6 +1619,7 @@ impl DistCalculator for RabitDistCalculator<'_> { dists: &mut Vec, quantized_dists: &mut Vec, quantized_dists_table: &mut Vec, + hacc_quantized_dists: &mut Vec, ) { let code_len = rabit_binary_code_bytes(self.dim); let n = self.codes.len() / code_len; @@ -1534,13 +1629,17 @@ impl DistCalculator for RabitDistCalculator<'_> { return; } - if self.query_estimator == RabitQueryEstimator::ResidualQuery || self.num_bits == 1 { + if self.query_estimator == RabitQueryEstimator::ResidualQuery + || self.num_bits == 1 + || self.approx_mode == ApproxMode::Fast + { self.one_bit_distances_with_scratch( n, code_len, dists, quantized_dists, quantized_dists_table, + hacc_quantized_dists, ); return; } @@ -1551,6 +1650,7 @@ impl DistCalculator for RabitDistCalculator<'_> { dists, quantized_dists, quantized_dists_table, + hacc_quantized_dists, ); self.apply_raw_query_multi_bit_distances( @@ -1572,13 +1672,20 @@ impl DistCalculator for RabitDistCalculator<'_> { dists: &mut Vec, 
quantized_dists: &mut Vec, quantized_dists_table: &mut Vec, + hacc_quantized_dists: &mut Vec, ) { if k == 0 { return; } if let Some(reason) = self.raw_query_lower_bound_gating_disabled_reason() { record_rabit_prune_bypass(reason); - self.distance_all_with_scratch(k, dists, quantized_dists, quantized_dists_table); + self.distance_all_with_scratch( + k, + dists, + quantized_dists, + quantized_dists_table, + hacc_quantized_dists, + ); accumulate_distances_into_heap(k, lower_bound, upper_bound, row_id, res, dists); return; } @@ -1594,6 +1701,7 @@ impl DistCalculator for RabitDistCalculator<'_> { dists, quantized_dists, quantized_dists_table, + hacc_quantized_dists, ); } @@ -1609,13 +1717,20 @@ impl DistCalculator for RabitDistCalculator<'_> { dists: &mut Vec, quantized_dists: &mut Vec, quantized_dists_table: &mut Vec, + hacc_quantized_dists: &mut Vec, ) { if k == 0 { return; } if let Some(reason) = self.raw_query_lower_bound_gating_disabled_reason() { record_rabit_prune_bypass(reason); - self.distance_all_with_scratch(k, dists, quantized_dists, quantized_dists_table); + self.distance_all_with_scratch( + k, + dists, + quantized_dists, + quantized_dists_table, + hacc_quantized_dists, + ); accumulate_filtered_distances_into_heap( k, lower_bound, @@ -1639,6 +1754,7 @@ impl DistCalculator for RabitDistCalculator<'_> { dists, quantized_dists, quantized_dists_table, + hacc_quantized_dists, ); } } @@ -1763,14 +1879,15 @@ impl VectorStore for RabitQuantizationStorage { }; let sum_q = rotated_qr.into_iter().sum(); - self.distance_calculator_from_parts( - code_dim, - Cow::Owned(dist_table), - Cow::Owned(ex_dist_table), + self.distance_calculator_from_parts(RabitDistCalculatorParts { + dim: code_dim, + dist_table: Cow::Owned(dist_table), + ex_dist_table: Cow::Owned(ex_dist_table), sum_q, query_factor, query_error, - ) + approx_mode: ApproxMode::Normal, + }) } // qr = (q-c) @@ -1781,6 +1898,7 @@ impl VectorStore for RabitQuantizationStorage { dist_q_c: f32, residual: Option>, 
f32_scratch: &'a mut Vec, + options: DistanceCalculatorOptions, ) -> Self::DistanceCalculator<'a> { let code_dim = self.code_dim(); if let ( @@ -1800,14 +1918,15 @@ impl VectorStore for RabitQuantizationStorage { &raw_query.rotated_query, rotated_centroid, ); - return self.distance_calculator_from_parts( - code_dim, - Cow::Borrowed(&raw_query.dist_table), - Cow::Borrowed(&raw_query.ex_dist_table), - raw_query.sum_q, + return self.distance_calculator_from_parts(RabitDistCalculatorParts { + dim: code_dim, + dist_table: Cow::Borrowed(&raw_query.dist_table), + ex_dist_table: Cow::Borrowed(&raw_query.ex_dist_table), + sum_q: raw_query.sum_q, query_factor, query_error, - ); + approx_mode: options.approx_mode, + }); } let dist_table_len = code_dim * 4; @@ -1866,17 +1985,18 @@ impl VectorStore for RabitQuantizationStorage { rotated_qr.iter().copied().sum() }; - self.distance_calculator_from_parts( - code_dim, - Cow::Borrowed(&f32_scratch[code_dim..code_dim + dist_table_len]), - Cow::Borrowed( + self.distance_calculator_from_parts(RabitDistCalculatorParts { + dim: code_dim, + dist_table: Cow::Borrowed(&f32_scratch[code_dim..code_dim + dist_table_len]), + ex_dist_table: Cow::Borrowed( &f32_scratch [code_dim + dist_table_len..code_dim + dist_table_len + ex_dist_table_len], ), sum_q, query_factor, query_error, - ) + approx_mode: options.approx_mode, + }) } // TODO: implement this @@ -2420,8 +2540,13 @@ mod tests { let mut scratch = Vec::with_capacity(expected_scratch_len); let initial_ptr = scratch.as_ptr(); { - let calc = - storage.dist_calculator_with_scratch(query.clone(), 0.25, None, &mut scratch); + let calc = storage.dist_calculator_with_scratch( + query.clone(), + 0.25, + None, + &mut scratch, + DistanceCalculatorOptions::default(), + ); assert_eq!(calc.distance_all(0), expected); } assert_eq!(scratch.len(), expected_scratch_len); @@ -2429,7 +2554,13 @@ mod tests { scratch.fill(f32::NAN); { - let calc = storage.dist_calculator_with_scratch(query, 0.25, None, &mut 
scratch); + let calc = storage.dist_calculator_with_scratch( + query, + 0.25, + None, + &mut scratch, + DistanceCalculatorOptions::default(), + ); assert_eq!(calc.distance_all(0), expected); } assert_eq!(scratch.as_ptr(), initial_ptr); @@ -2470,6 +2601,7 @@ mod tests { 0.25, Some(QueryResidual::Centroid(centroid.as_ref())), &mut scratch, + DistanceCalculatorOptions::default(), ); assert_eq!(calc.distance_all(0), expected); @@ -2508,6 +2640,7 @@ mod tests { 0.25, Some(QueryResidual::Centroid(centroid.as_ref())), &mut scratch, + DistanceCalculatorOptions::default(), ); assert_eq!(calc.distance_all(0), expected); @@ -2754,10 +2887,180 @@ mod tests { let mut distances = Vec::new(); let mut u16_scratch = Vec::new(); let mut u8_scratch = Vec::new(); - calc.distance_all_with_scratch(0, &mut distances, &mut u16_scratch, &mut u8_scratch); + let mut u32_scratch = Vec::new(); + calc.distance_all_with_scratch( + 0, + &mut distances, + &mut u16_scratch, + &mut u8_scratch, + &mut u32_scratch, + ); assert_eq!(distances, vec![104.0, 22.0]); } + #[test] + fn test_fast_approx_mode_uses_one_bit_scores_for_multi_bit_raw_query() { + let code_dim = 8usize; + let identity = Float32Array::from_iter_values( + (0..code_dim) + .flat_map(|row| (0..code_dim).map(move |col| if row == col { 1.0 } else { 0.0 })), + ); + let rotate_mat = + FixedSizeListArray::try_new_from_values(identity, code_dim as i32).unwrap(); + let metadata = RabitQuantizationMetadata { + rotate_mat: Some(rotate_mat), + rotate_mat_position: None, + fast_rotation_signs: None, + rotation_type: RQRotationType::Matrix, + code_dim: code_dim as u32, + num_bits: 2, + packed: false, + query_estimator: RabitQueryEstimator::RawQuery, + }; + let codes = + FixedSizeListArray::try_new_from_values(UInt8Array::from(vec![0xff, 0xff]), 1).unwrap(); + let ex_codes = + FixedSizeListArray::try_new_from_values(UInt8Array::from(vec![0x00, 0xff]), 1).unwrap(); + let batch = make_test_batch_with_ex(codes, ex_codes) + .replace_column_by_name( + 
SCALE_FACTORS_COLUMN, + Arc::new(Float32Array::from(vec![0.0, 0.0])), + ) + .unwrap(); + let storage = + RabitQuantizationStorage::try_from_batch(batch, &metadata, DistanceType::L2, None) + .unwrap(); + let query = Arc::new(Float32Array::from(vec![1.0; code_dim])) as ArrayRef; + let normal = storage.dist_calculator(query.clone(), 0.0).distance_all(0); + + let mut f32_scratch = Vec::new(); + let calc = storage.dist_calculator_with_scratch( + query, + 0.0, + None, + &mut f32_scratch, + DistanceCalculatorOptions { + approx_mode: ApproxMode::Fast, + }, + ); + let mut distances = Vec::new(); + let mut u16_scratch = Vec::new(); + let mut u8_scratch = Vec::new(); + let mut u32_scratch = Vec::new(); + calc.distance_all_with_scratch( + 0, + &mut distances, + &mut u16_scratch, + &mut u8_scratch, + &mut u32_scratch, + ); + + let expected_fast = (0..2) + .map(|id| calc.distance(id as u32)) + .collect::>(); + assert_ne!(normal, distances); + assert_eq!(distances, expected_fast); + assert_eq!( + calc.raw_query_lower_bound_gating_disabled_reason(), + Some("approx_mode_fast") + ); + } + + #[test] + fn test_accurate_approx_mode_reduces_binary_lut_quantization_error() { + let code_dim = 64usize; + let num_rows = BATCH_SIZE; + let original_codes = make_test_codes(num_rows, code_dim as i32); + let metadata = make_test_metadata(code_dim); + let storage = RabitQuantizationStorage::try_from_batch( + make_test_batch(original_codes), + &metadata, + DistanceType::L2, + None, + ) + .unwrap(); + let query = Arc::new(Float32Array::from_iter_values( + (0..code_dim).map(|idx| (idx as f32 * 0.137).sin() + idx as f32 * 0.003), + )) as ArrayRef; + let exact_calc = storage.dist_calculator(query.clone(), 0.0); + let exact = (0..num_rows) + .map(|id| exact_calc.distance(id as u32)) + .collect::>(); + + let normal = { + let mut f32_scratch = Vec::new(); + let calc = storage.dist_calculator_with_scratch( + query.clone(), + 0.0, + None, + &mut f32_scratch, + DistanceCalculatorOptions::default(), + ); + 
let mut distances = Vec::new(); + let mut u16_scratch = Vec::new(); + let mut u8_scratch = Vec::new(); + let mut u32_scratch = Vec::new(); + calc.distance_all_with_scratch( + 0, + &mut distances, + &mut u16_scratch, + &mut u8_scratch, + &mut u32_scratch, + ); + distances + }; + + let (accurate, hacc_table_len, hacc_packed_table_len, hacc_accum_len) = { + let mut f32_scratch = Vec::new(); + let calc = storage.dist_calculator_with_scratch( + query, + 0.0, + None, + &mut f32_scratch, + DistanceCalculatorOptions { + approx_mode: ApproxMode::Accurate, + }, + ); + let mut distances = Vec::new(); + let mut u16_scratch = Vec::new(); + let mut u8_scratch = Vec::new(); + let mut u32_scratch = Vec::new(); + calc.distance_all_with_scratch( + 0, + &mut distances, + &mut u16_scratch, + &mut u8_scratch, + &mut u32_scratch, + ); + ( + distances, + u16_scratch.len(), + u8_scratch.len(), + u32_scratch.len(), + ) + }; + + let normal_error = normal + .iter() + .zip(exact.iter()) + .map(|(actual, expected)| (actual - expected).abs()) + .sum::(); + let accurate_error = accurate + .iter() + .zip(exact.iter()) + .map(|(actual, expected)| (actual - expected).abs()) + .sum::(); + + assert!(normal_error > 0.0); + assert!( + accurate_error < normal_error, + "accurate_error={accurate_error}, normal_error={normal_error}" + ); + assert_eq!(hacc_table_len, code_dim * 4); + assert_eq!(hacc_packed_table_len, code_dim * 8); + assert_eq!(hacc_accum_len, num_rows); + } + fn assert_raw_query_multi_bit_distance_all_uses_fastscan(num_bits: u8) { let code_dim = 8usize; let num_rows = BATCH_SIZE + 1; @@ -2826,7 +3129,14 @@ mod tests { let mut distances = Vec::new(); let mut u16_scratch = Vec::new(); let mut u8_scratch = Vec::new(); - calc.distance_all_with_scratch(0, &mut distances, &mut u16_scratch, &mut u8_scratch); + let mut u32_scratch = Vec::new(); + calc.distance_all_with_scratch( + 0, + &mut distances, + &mut u16_scratch, + &mut u8_scratch, + &mut u32_scratch, + ); assert_eq!(distances.len(), 
num_rows); assert_eq!(u16_scratch.len(), BATCH_SIZE); @@ -2906,12 +3216,14 @@ mod tests { let mut binary_ips = Vec::new(); let mut binary_u16_scratch = Vec::new(); let mut binary_u8_scratch = Vec::new(); + let mut binary_u32_scratch = Vec::new(); calc.binary_distances_with_scratch( num_rows, rabit_binary_code_bytes(code_dim), &mut binary_ips, &mut binary_u16_scratch, &mut binary_u8_scratch, + &mut binary_u32_scratch, ); let ex_codes = calc.ex_codes.unwrap(); let ex_add_factors = calc.ex_add_factors.unwrap(); @@ -2947,6 +3259,7 @@ mod tests { let mut distances = Vec::new(); let mut u16_scratch = Vec::new(); let mut u8_scratch = Vec::new(); + let mut u32_scratch = Vec::new(); calc.accumulate_topk_with_scratch( k, None, @@ -2956,6 +3269,7 @@ mod tests { &mut distances, &mut u16_scratch, &mut u8_scratch, + &mut u32_scratch, ); let mut actual = heap .into_iter() @@ -3053,6 +3367,7 @@ mod tests { query: None, }), &mut fallback_scratch, + DistanceCalculatorOptions::default(), ) .distance_all(0); @@ -3066,6 +3381,7 @@ mod tests { query: Some(&raw_query), }), &mut prepared_scratch, + DistanceCalculatorOptions::default(), ) .distance_all(0); diff --git a/rust/lance-index/src/vector/flat/index.rs b/rust/lance-index/src/vector/flat/index.rs index 4b7ff271d19..2c099eb5435 100644 --- a/rust/lance-index/src/vector/flat/index.rs +++ b/rust/lance-index/src/vector/flat/index.rs @@ -20,10 +20,12 @@ use crate::{ metrics::MetricsCollector, prefilter::PreFilter, vector::{ - DIST_COL, Query, + ApproxMode, DIST_COL, Query, graph::{OrderedFloat, OrderedNode}, quantizer::{Quantization, QuantizationType, Quantizer, QuantizerMetadata}, - storage::{DistCalculator, QueryResidual, QueryScratch, VectorStore}, + storage::{ + DistCalculator, DistanceCalculatorOptions, QueryResidual, QueryScratch, VectorStore, + }, v3::subindex::IvfSubIndex, }, }; @@ -68,6 +70,7 @@ pub struct FlatQueryParams { lower_bound: Option, upper_bound: Option, dist_q_c: f32, + approx_mode: ApproxMode, } impl From<&Query> for 
FlatQueryParams { @@ -76,6 +79,7 @@ impl From<&Query> for FlatQueryParams { lower_bound: q.lower_bound, upper_bound: q.upper_bound, dist_q_c: q.dist_q_c, + approx_mode: q.approx_mode, } } } @@ -136,6 +140,9 @@ impl IvfSubIndex for FlatIndex { params.dist_q_c, residual, &mut scratch.query_f32, + DistanceCalculatorOptions { + approx_mode: params.approx_mode, + }, ); let mut res = BinaryHeap::with_capacity(k); metrics.record_comparisons(storage.len()); @@ -147,6 +154,7 @@ impl IvfSubIndex for FlatIndex { &mut scratch.distances, &mut scratch.u16, &mut scratch.u8, + &mut scratch.u32, ); let dists = scratch.distances.iter().copied(); @@ -254,6 +262,9 @@ impl IvfSubIndex for FlatIndex { params.dist_q_c, residual, &mut scratch.query_f32, + DistanceCalculatorOptions { + approx_mode: params.approx_mode, + }, ); metrics.record_comparisons(storage.len()); @@ -268,6 +279,7 @@ impl IvfSubIndex for FlatIndex { &mut scratch.distances, &mut scratch.u16, &mut scratch.u8, + &mut scratch.u32, ); } false => { @@ -282,6 +294,7 @@ impl IvfSubIndex for FlatIndex { &mut scratch.distances, &mut scratch.u16, &mut scratch.u8, + &mut scratch.u32, ); } }; diff --git a/rust/lance-index/src/vector/storage.rs b/rust/lance-index/src/vector/storage.rs index ff9bc372464..b036e187b77 100644 --- a/rust/lance-index/src/vector/storage.rs +++ b/rust/lance-index/src/vector/storage.rs @@ -35,10 +35,10 @@ use crate::{ }, }; -use super::DISTANCE_TYPE_KEY; use super::graph::OrderedFloat; use super::graph::OrderedNode; use super::quantizer::{Quantizer, QuantizerMetadata}; +use super::{ApproxMode, DISTANCE_TYPE_KEY}; ///

/// Internal API @@ -59,6 +59,7 @@ pub trait DistCalculator { dists: &mut Vec, _u16_scratch: &mut Vec, _u8_scratch: &mut Vec, + _u32_scratch: &mut Vec, ) { *dists = self.distance_all(k_hint); } @@ -76,12 +77,13 @@ pub trait DistCalculator { dists: &mut Vec, u16_scratch: &mut Vec, u8_scratch: &mut Vec, + u32_scratch: &mut Vec, ) { if k == 0 { return; } - self.distance_all_with_scratch(k, dists, u16_scratch, u8_scratch); + self.distance_all_with_scratch(k, dists, u16_scratch, u8_scratch, u32_scratch); let lower_bound = lower_bound.unwrap_or(f32::MIN).into(); let upper_bound = upper_bound.unwrap_or(f32::MAX).into(); let mut max_dist = res.peek().map(|node| node.dist); @@ -116,6 +118,7 @@ pub trait DistCalculator { _dists: &mut Vec, _u16_scratch: &mut Vec, _u8_scratch: &mut Vec, + _u32_scratch: &mut Vec, ) { if k == 0 { return; @@ -155,6 +158,7 @@ pub struct QueryScratch { pub query_f32: Vec, pub u16: Vec, pub u8: Vec, + pub u32: Vec, } impl QueryScratch { @@ -164,6 +168,7 @@ impl QueryScratch { query_f32: Vec::new(), u16: Vec::new(), u8: Vec::new(), + u32: Vec::new(), } } @@ -173,6 +178,7 @@ impl QueryScratch { query_f32: vec![0.0; capacity.query_f32], u16: vec![0; capacity.u16], u8: vec![0; capacity.u8], + u32: vec![0; capacity.u32], } } } @@ -189,6 +195,7 @@ impl DeepSizeOf for QueryScratch { + self.query_f32.capacity() * size_of::() + self.u16.capacity() * size_of::() + self.u8.capacity() * size_of::() + + self.u32.capacity() * size_of::() } } @@ -198,15 +205,27 @@ pub struct QueryScratchCapacity { pub query_f32: usize, pub u16: usize, pub u8: usize, + pub u32: usize, } impl QueryScratchCapacity { pub const fn new(distances: usize, query_f32: usize, u16: usize, u8: usize) -> Self { + Self::new_with_u32(distances, query_f32, u16, u8, 0) + } + + pub const fn new_with_u32( + distances: usize, + query_f32: usize, + u16: usize, + u8: usize, + u32: usize, + ) -> Self { Self { distances, query_f32, u16, u8, + u32, } } @@ -215,9 +234,15 @@ impl QueryScratchCapacity { + 
self.query_f32 * size_of::() + self.u16 * size_of::() + self.u8 * size_of::() + + self.u32 * size_of::() } } +#[derive(Clone, Copy, Debug, Default)] +pub struct DistanceCalculatorOptions { + pub approx_mode: ApproxMode, +} + #[derive(Debug)] pub struct RabitRawQueryContext { pub code_dim: usize, @@ -393,6 +418,7 @@ pub trait VectorStore: Send + Sync + Sized + Clone { dist_q_c: f32, _residual: Option>, _f32_scratch: &'a mut Vec, + _options: DistanceCalculatorOptions, ) -> Self::DistanceCalculator<'a> { self.dist_calculator(query, dist_q_c) } @@ -641,11 +667,14 @@ mod tests { scratch.u16.resize(4, 3); scratch.u8.clear(); scratch.u8.resize(2, 4); + scratch.u32.clear(); + scratch.u32.resize(3, 5); ( scratch.query_f32.as_ptr(), scratch.distances.as_ptr(), scratch.u16.as_ptr(), scratch.u8.as_ptr(), + scratch.u32.as_ptr(), ) }); @@ -658,11 +687,14 @@ mod tests { assert!(scratch.u16.iter().all(|value| *value == 3)); assert_eq!(scratch.u8.len(), 2); assert!(scratch.u8.iter().all(|value| *value == 4)); + assert_eq!(scratch.u32.len(), 3); + assert!(scratch.u32.iter().all(|value| *value == 5)); ( scratch.query_f32.as_ptr(), scratch.distances.as_ptr(), scratch.u16.as_ptr(), scratch.u8.as_ptr(), + scratch.u32.as_ptr(), ) }); @@ -688,7 +720,8 @@ mod tests { #[test] fn test_query_scratch_pool_uses_temporary_scratch_when_empty() { - let pool = QueryScratchPool::with_capacity(1, QueryScratchCapacity::new(8, 16, 4, 2)); + let pool = + QueryScratchPool::with_capacity(1, QueryScratchCapacity::new_with_u32(8, 16, 4, 2, 3)); let pooled = pool.scratch(); assert!(pooled.pooled); @@ -698,12 +731,14 @@ mod tests { assert_eq!(temporary.query_f32.len(), 16); assert_eq!(temporary.u16.len(), 4); assert_eq!(temporary.u8.len(), 2); + assert_eq!(temporary.u32.len(), 3); } #[test] fn test_query_scratch_pool_deep_size_includes_buffer_capacity() { let empty_size = QueryScratchPool::new(1).deep_size_of(); - let pool = QueryScratchPool::with_capacity(1, QueryScratchCapacity::new(8, 16, 4, 2)); + let 
pool = + QueryScratchPool::with_capacity(1, QueryScratchCapacity::new_with_u32(8, 16, 4, 2, 3)); assert!(pool.deep_size_of() > empty_size); @@ -715,7 +750,8 @@ mod tests { #[test] fn test_query_scratch_pool_initializes_buffer_capacity() { - let pool = QueryScratchPool::with_capacity(1, QueryScratchCapacity::new(8, 16, 4, 2)); + let pool = + QueryScratchPool::with_capacity(1, QueryScratchCapacity::new_with_u32(8, 16, 4, 2, 3)); pool.with_scratch(|scratch| { assert_eq!(scratch.distances.len(), 8); @@ -726,6 +762,8 @@ mod tests { assert_eq!(scratch.u16.capacity(), 4); assert_eq!(scratch.u8.len(), 2); assert_eq!(scratch.u8.capacity(), 2); + assert_eq!(scratch.u32.len(), 3); + assert_eq!(scratch.u32.capacity(), 3); }); } } diff --git a/rust/lance-linalg/src/simd/dist_table.rs b/rust/lance-linalg/src/simd/dist_table.rs index addd90381d7..626c1581b15 100644 --- a/rust/lance-linalg/src/simd/dist_table.rs +++ b/rust/lance-linalg/src/simd/dist_table.rs @@ -113,6 +113,274 @@ pub fn sum_4bit_dist_table_scalar( } } +#[inline] +#[allow(unused)] +pub fn sum_4bit_dist_table_u16( + n: usize, + code_len: usize, + codes: &[u8], + dist_table: &[u16], + dists: &mut [u32], +) { + debug_assert!(n.is_multiple_of(BATCH_SIZE)); + debug_assert!(dists.len() >= n); + debug_assert!(codes.len() >= n * code_len); + sum_4bit_dist_table_u16_scalar( + code_len, + &codes[..n * code_len], + dist_table, + &mut dists[..n], + ); +} + +#[inline] +pub fn transfer_4bit_dist_table_u16(dist_table: &[u16], hacc_dist_table: &mut Vec) { + debug_assert!(dist_table.len().is_multiple_of(32)); + + let num_tables = dist_table.len() / 16; + hacc_dist_table.clear(); + hacc_dist_table.resize(dist_table.len() * 2, 0); + + for table_idx in 0..num_tables { + let table = &dist_table[table_idx * 16..(table_idx + 1) * 16]; + let low_offset = (table_idx / 2) * 64 + (table_idx % 2) * 16; + let high_offset = low_offset + 32; + for (code, value) in table.iter().enumerate() { + hacc_dist_table[low_offset + code] = *value as u8; + 
hacc_dist_table[high_offset + code] = (value >> 8) as u8; + } + } +} + +#[inline] +pub fn sum_4bit_hacc_dist_table( + n: usize, + code_len: usize, + codes: &[u8], + hacc_dist_table: &[u8], + dists: &mut [u32], +) { + debug_assert!(n.is_multiple_of(BATCH_SIZE)); + debug_assert!(dists.len() >= n); + debug_assert!(codes.len() >= n * code_len); + debug_assert!(hacc_dist_table.len() >= code_len * 64); + + match *SIMD_SUPPORT { + #[cfg(target_arch = "x86_64")] + SimdSupport::Avx512 | SimdSupport::Avx512FP16 | SimdSupport::Avx2 + if std::arch::is_x86_feature_detected!("avx2") => + { + sum_4bit_hacc_dist_table_avx2(n, code_len, codes, hacc_dist_table, dists); + } + _ => sum_4bit_hacc_dist_table_scalar(code_len, codes, hacc_dist_table, dists), + } +} + +#[inline] +#[allow(unused)] +pub fn sum_4bit_hacc_dist_table_scalar( + code_len: usize, + codes: &[u8], + hacc_dist_table: &[u8], + dists: &mut [u32], +) { + let num_full_vectors = codes.len() / (BATCH_SIZE * code_len) * BATCH_SIZE; + dists[..num_full_vectors].fill(0); + + for (vec_block_idx, blocks) in codes.chunks_exact(BATCH_SIZE * code_len).enumerate() { + for (sub_vec_idx, block) in blocks.chunks_exact(BATCH_SIZE).enumerate() { + let table_offset = sub_vec_idx * 64; + let current_low = &hacc_dist_table[table_offset..table_offset + 16]; + let next_low = &hacc_dist_table[table_offset + 16..table_offset + 32]; + let current_high = &hacc_dist_table[table_offset + 32..table_offset + 48]; + let next_high = &hacc_dist_table[table_offset + 48..table_offset + 64]; + + for j in 0..16 { + let low_current_code = (block[j] & 0x0F) as usize; + let high_current_code = (block[j] >> 4) as usize; + let low_next_code = (block[j + 16] & 0x0F) as usize; + let high_next_code = (block[j + 16] >> 4) as usize; + + let lower_id = vec_block_idx * BATCH_SIZE + PERM0[j]; + let higher_id = lower_id + 16; + dists[lower_id] += ((current_high[low_current_code] as u32) << 8) + + current_low[low_current_code] as u32 + + ((next_high[low_next_code] as u32) 
<< 8) + + next_low[low_next_code] as u32; + dists[higher_id] += ((current_high[high_current_code] as u32) << 8) + + current_low[high_current_code] as u32 + + ((next_high[high_next_code] as u32) << 8) + + next_low[high_next_code] as u32; + } + } + } +} + +#[inline] +#[allow(unused)] +pub fn sum_4bit_dist_table_u16_scalar( + code_len: usize, + codes: &[u8], + dist_table: &[u16], + dists: &mut [u32], +) { + let num_full_vectors = codes.len() / (BATCH_SIZE * code_len) * BATCH_SIZE; + dists[..num_full_vectors].fill(0); + + for (vec_block_idx, blocks) in codes.chunks_exact(BATCH_SIZE * code_len).enumerate() { + for (sub_vec_idx, block) in blocks.chunks_exact(BATCH_SIZE).enumerate() { + let current_dist_table = &dist_table[sub_vec_idx * 2 * 16..(sub_vec_idx * 2 + 1) * 16]; + let next_dist_table = + &dist_table[(sub_vec_idx * 2 + 1) * 16..(sub_vec_idx * 2 + 2) * 16]; + + for j in 0..16 { + let low_current_code = (block[j] & 0x0F) as usize; + let high_current_code = (block[j] >> 4) as usize; + let low_next_code = (block[j + 16] & 0x0F) as usize; + let high_next_code = (block[j + 16] >> 4) as usize; + + let lower_id = vec_block_idx * BATCH_SIZE + PERM0[j]; + let higher_id = lower_id + 16; + dists[lower_id] += current_dist_table[low_current_code] as u32 + + next_dist_table[low_next_code] as u32; + dists[higher_id] += current_dist_table[high_current_code] as u32 + + next_dist_table[high_next_code] as u32; + } + } + } +} + +#[cfg(target_arch = "x86_64")] +#[inline] +fn sum_4bit_hacc_dist_table_avx2( + n: usize, + code_len: usize, + codes: &[u8], + hacc_dist_table: &[u8], + dists: &mut [u32], +) { + const SAFE_CODE_LEN: usize = 128; + + for i in (0..n).step_by(BATCH_SIZE) { + let batch_codes = &codes[i * code_len..(i + BATCH_SIZE) * code_len]; + let batch_dists = &mut dists[i..i + BATCH_SIZE]; + batch_dists.fill(0); + + for code_start in (0..code_len).step_by(SAFE_CODE_LEN) { + let code_end = (code_start + SAFE_CODE_LEN).min(code_len); + let code_range = code_start * 
BATCH_SIZE..code_end * BATCH_SIZE; + let table_range = code_start * 64..code_end * 64; + if code_start == 0 && code_end == code_len { + unsafe { + sum_hacc_dist_table_32bytes_batch_avx2( + &batch_codes[code_range], + &hacc_dist_table[table_range], + batch_dists, + ); + } + } else { + let mut chunk_dists = [0u32; BATCH_SIZE]; + unsafe { + sum_hacc_dist_table_32bytes_batch_avx2( + &batch_codes[code_range], + &hacc_dist_table[table_range], + &mut chunk_dists, + ); + } + batch_dists + .iter_mut() + .zip(chunk_dists.iter()) + .for_each(|(dist, chunk_dist)| *dist += *chunk_dist); + } + } + } +} + +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx2")] +#[inline] +#[allow(unused)] +unsafe fn sum_hacc_dist_table_32bytes_batch_avx2( + codes: &[u8], + hacc_dist_table: &[u8], + dists: &mut [u32], +) { + let low_mask = _mm256_set1_epi8(0x0f); + let mut low_accu0 = _mm256_setzero_si256(); + let mut low_accu1 = _mm256_setzero_si256(); + let mut low_accu2 = _mm256_setzero_si256(); + let mut low_accu3 = _mm256_setzero_si256(); + let mut high_accu0 = _mm256_setzero_si256(); + let mut high_accu1 = _mm256_setzero_si256(); + let mut high_accu2 = _mm256_setzero_si256(); + let mut high_accu3 = _mm256_setzero_si256(); + + for code_offset in (0..codes.len()).step_by(BATCH_SIZE) { + let table_offset = code_offset * 2; + let c = _mm256_loadu_si256(codes.as_ptr().add(code_offset) as *const __m256i); + let lo = _mm256_and_si256(c, low_mask); + let hi = _mm256_and_si256(_mm256_srli_epi16(c, 4), low_mask); + + let low_lut = + _mm256_loadu_si256(hacc_dist_table.as_ptr().add(table_offset) as *const __m256i); + let low_res_lo = _mm256_shuffle_epi8(low_lut, lo); + let low_res_hi = _mm256_shuffle_epi8(low_lut, hi); + low_accu0 = _mm256_add_epi16(low_accu0, low_res_lo); + low_accu1 = _mm256_add_epi16(low_accu1, _mm256_srli_epi16(low_res_lo, 8)); + low_accu2 = _mm256_add_epi16(low_accu2, low_res_hi); + low_accu3 = _mm256_add_epi16(low_accu3, _mm256_srli_epi16(low_res_hi, 8)); + + let 
high_lut = + _mm256_loadu_si256(hacc_dist_table.as_ptr().add(table_offset + 32) as *const __m256i); + let high_res_lo = _mm256_shuffle_epi8(high_lut, lo); + let high_res_hi = _mm256_shuffle_epi8(high_lut, hi); + high_accu0 = _mm256_add_epi16(high_accu0, high_res_lo); + high_accu1 = _mm256_add_epi16(high_accu1, _mm256_srli_epi16(high_res_lo, 8)); + high_accu2 = _mm256_add_epi16(high_accu2, high_res_hi); + high_accu3 = _mm256_add_epi16(high_accu3, _mm256_srli_epi16(high_res_hi, 8)); + } + + low_accu0 = _mm256_sub_epi16(low_accu0, _mm256_slli_epi16(low_accu1, 8)); + let low_dis0 = _mm256_add_epi16( + _mm256_permute2f128_si256(low_accu0, low_accu1, 0x21), + _mm256_blend_epi32(low_accu0, low_accu1, 0xF0), + ); + low_accu2 = _mm256_sub_epi16(low_accu2, _mm256_slli_epi16(low_accu3, 8)); + let low_dis1 = _mm256_add_epi16( + _mm256_permute2f128_si256(low_accu2, low_accu3, 0x21), + _mm256_blend_epi32(low_accu2, low_accu3, 0xF0), + ); + + high_accu0 = _mm256_sub_epi16(high_accu0, _mm256_slli_epi16(high_accu1, 8)); + let high_dis0 = _mm256_add_epi16( + _mm256_permute2f128_si256(high_accu0, high_accu1, 0x21), + _mm256_blend_epi32(high_accu0, high_accu1, 0xF0), + ); + high_accu2 = _mm256_sub_epi16(high_accu2, _mm256_slli_epi16(high_accu3, 8)); + let high_dis1 = _mm256_add_epi16( + _mm256_permute2f128_si256(high_accu2, high_accu3, 0x21), + _mm256_blend_epi32(high_accu2, high_accu3, 0xF0), + ); + + let low0 = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(low_dis0)); + let low1 = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(low_dis0, 1)); + let high0 = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(high_dis0)); + let high1 = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(high_dis0, 1)); + let res0 = _mm256_add_epi32(low0, _mm256_slli_epi32(high0, 8)); + let res1 = _mm256_add_epi32(low1, _mm256_slli_epi32(high1, 8)); + _mm256_storeu_si256(dists.as_mut_ptr() as *mut __m256i, res0); + _mm256_storeu_si256(dists.as_mut_ptr().add(8) as *mut __m256i, res1); + + let low2 = 
_mm256_cvtepu16_epi32(_mm256_castsi256_si128(low_dis1)); + let low3 = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(low_dis1, 1)); + let high2 = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(high_dis1)); + let high3 = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(high_dis1, 1)); + let res2 = _mm256_add_epi32(low2, _mm256_slli_epi32(high2, 8)); + let res3 = _mm256_add_epi32(low3, _mm256_slli_epi32(high3, 8)); + _mm256_storeu_si256(dists.as_mut_ptr().add(16) as *mut __m256i, res2); + _mm256_storeu_si256(dists.as_mut_ptr().add(24) as *mut __m256i, res3); +} + #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx2")] #[inline] @@ -338,6 +606,101 @@ mod tests { assert!(actual.iter().all(|dist| *dist != u16::MAX)); } + #[test] + fn test_sum_4bit_dist_table_u16_basic() { + let n = BATCH_SIZE; + let code_len = 2; + let codes = [ + 0x12, 0x34, 0x56, 0x78, 0x9a, 0xbc, 0xde, 0xf0, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, + 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0x00, 0x12, 0x34, 0x56, 0x78, + 0x9a, 0xbc, 0xde, 0xf0, + ]; + let codes = codes.repeat(n * code_len / codes.len()); + let dist_table: Vec = (0..16 * 4).map(|idx| (idx % 16 + 1) as u16).collect(); + + let mut dists = vec![0u32; n]; + sum_4bit_dist_table_u16(n, code_len, &codes, &dist_table, &mut dists); + + assert_eq!(dists[1], 38); + } + + #[test] + fn test_transfer_4bit_dist_table_u16_layout() { + let dist_table: Vec = (0..32).map(|idx| 0x1200 + idx as u16).collect(); + let mut hacc_dist_table = Vec::new(); + transfer_4bit_dist_table_u16(&dist_table, &mut hacc_dist_table); + + assert_eq!(hacc_dist_table.len(), 64); + for code in 0..16 { + assert_eq!(hacc_dist_table[code], dist_table[code] as u8); + assert_eq!(hacc_dist_table[16 + code], dist_table[16 + code] as u8); + assert_eq!(hacc_dist_table[32 + code], (dist_table[code] >> 8) as u8); + assert_eq!( + hacc_dist_table[48 + code], + (dist_table[16 + code] >> 8) as u8 + ); + } + } + + #[test] + fn 
test_sum_4bit_dist_table_u16_matches_reference_multi_batch() { + use rand::{Rng, SeedableRng}; + let mut rng = rand::rngs::StdRng::seed_from_u64(99); + + for code_len in [1, 3, 16, 191, 192, 1024] { + let n = BATCH_SIZE * 4; + let codes: Vec = (0..n * code_len).map(|_| rng.random::()).collect(); + let dist_table: Vec = (0..BATCH_SIZE * code_len) + .map(|_| rng.random::()) + .collect(); + + let mut expected = vec![0u32; n]; + sum_4bit_dist_table_u16_scalar(code_len, &codes, &dist_table, &mut expected); + + let mut actual = vec![u32::MAX; n]; + sum_4bit_dist_table_u16(n, code_len, &codes, &dist_table, &mut actual); + + assert_eq!( + actual, + expected, + "u16 dist-table mismatch for code_len={} (DIM={})", + code_len, + code_len * 8, + ); + } + } + + #[test] + fn test_sum_4bit_hacc_dist_table_matches_u16_reference_multi_batch() { + use rand::{Rng, SeedableRng}; + let mut rng = rand::rngs::StdRng::seed_from_u64(101); + + for code_len in [1, 3, 16, 191, 192, 1024] { + let n = BATCH_SIZE * 4; + let codes: Vec = (0..n * code_len).map(|_| rng.random::()).collect(); + let dist_table: Vec = (0..BATCH_SIZE * code_len) + .map(|_| rng.random::()) + .collect(); + + let mut hacc_dist_table = Vec::new(); + transfer_4bit_dist_table_u16(&dist_table, &mut hacc_dist_table); + + let mut expected = vec![0u32; n]; + sum_4bit_dist_table_u16_scalar(code_len, &codes, &dist_table, &mut expected); + + let mut actual = vec![u32::MAX; n]; + sum_4bit_hacc_dist_table(n, code_len, &codes, &hacc_dist_table, &mut actual); + + assert_eq!( + actual, + expected, + "hacc dist-table mismatch for code_len={} (DIM={})", + code_len, + code_len * 8, + ); + } + } + /// Test that the SIMD path (NEON on ARM, AVX2 on x86) produces identical /// results to the scalar reference across a range of dimensions, including /// very large ones (up to DIM=65536). 
diff --git a/rust/lance/benches/mem_wal/vector/hnsw/mem_wal_recall_hnsw.rs b/rust/lance/benches/mem_wal/vector/hnsw/mem_wal_recall_hnsw.rs index e4232557e64..1b2824bcc83 100644 --- a/rust/lance/benches/mem_wal/vector/hnsw/mem_wal_recall_hnsw.rs +++ b/rust/lance/benches/mem_wal/vector/hnsw/mem_wal_recall_hnsw.rs @@ -589,6 +589,7 @@ async fn run_checkpoint( use_index: true, query_parallelism: 1, dist_q_c: 0.0, + approx_mode: Default::default(), }; // IVFIndex::search is intentionally unimplemented (top-level does // partition-aware search); replicate the ANN exec node: pick the diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index 4f9ff1323fe..6b19150c17b 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -72,7 +72,7 @@ use lance_index::scalar::inverted::query::{ FtsQuery, FtsQueryNode, FtsSearchParams, MatchQuery, PhraseQuery, fill_fts_query_column, }; use lance_index::scalar::inverted::{SCORE_COL, SCORE_FIELD}; -use lance_index::vector::{DEFAULT_QUERY_PARALLELISM, DIST_COL, Query}; +use lance_index::vector::{ApproxMode, DEFAULT_QUERY_PARALLELISM, DIST_COL, Query}; use lance_index::{metrics::NoOpMetricsCollector, scalar::inverted::FTS_SCHEMA}; use lance_io::stream::RecordBatchStream; use lance_linalg::distance::MetricType; @@ -1592,6 +1592,7 @@ impl Scanner { use_index: true, query_parallelism: DEFAULT_QUERY_PARALLELISM, dist_q_c: 0.0, + approx_mode: Default::default(), }); self.nearest_query_count = query_count; self.is_batch_nearest = is_batch_nearest; @@ -1760,6 +1761,19 @@ impl Scanner { self } + /// Configure the speed / accuracy tradeoff for approximate vector search. + /// + /// This setting is currently only used by RQ-quantized indexes, such as + /// IVF_RQ. Other index types ignore this setting. 
+ pub fn approx_mode(&mut self, approx_mode: ApproxMode) -> &mut Self { + if let Some(q) = self.nearest.as_mut() { + q.approx_mode = approx_mode; + } else { + log::warn!("approx_mode is not set because nearest has not been called yet"); + } + self + } + /// Configure partition-search concurrency for each vector query. /// /// The default is 0. @@ -10885,6 +10899,26 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") assert_eq!(scanner.nearest_mut().unwrap().query_parallelism, -1); } + #[tokio::test] + async fn test_knn_approx_mode_defaults_and_setter() { + let test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false) + .await + .unwrap(); + let query_vector = Float32Array::from(vec![0.0; 32]); + let mut scanner = test_ds.dataset.scan(); + scanner.nearest("vec", &query_vector, 5).unwrap(); + assert_eq!( + scanner.nearest_mut().unwrap().approx_mode, + ApproxMode::Normal + ); + + scanner.approx_mode(ApproxMode::Accurate); + assert_eq!( + scanner.nearest_mut().unwrap().approx_mode, + ApproxMode::Accurate + ); + } + #[tokio::test] async fn test_ivf_pq_query_parallelism_returns_same_results() { let mut test_ds = TestVectorDataset::new(LanceFileVersion::Stable, false) diff --git a/rust/lance/src/dataset/tests/dataset_scanner.rs b/rust/lance/src/dataset/tests/dataset_scanner.rs index dcc64aa0632..4c44cb0795b 100644 --- a/rust/lance/src/dataset/tests/dataset_scanner.rs +++ b/rust/lance/src/dataset/tests/dataset_scanner.rs @@ -62,6 +62,7 @@ async fn test_vector_filter_fts_search() { use_index: true, query_parallelism: DEFAULT_QUERY_PARALLELISM, dist_q_c: 0.0, + approx_mode: Default::default(), }; // Case 1: search with prefilter=true, query_filter=vector([300,300,300,300]) diff --git a/rust/lance/src/index/vector/fixture_test.rs b/rust/lance/src/index/vector/fixture_test.rs index 182012cb630..91d5c434dd1 100644 --- a/rust/lance/src/index/vector/fixture_test.rs +++ b/rust/lance/src/index/vector/fixture_test.rs @@ -269,6 +269,7 @@ mod 
test { use_index: true, query_parallelism: lance_index::vector::DEFAULT_QUERY_PARALLELISM, dist_q_c: 0.0, + approx_mode: Default::default(), }; let idx = make_idx.clone()(expected_query_at_subindex, metric).await; let (partition_ids, _) = idx.find_partitions(&q).unwrap(); diff --git a/rust/lance/src/index/vector/ivf.rs b/rust/lance/src/index/vector/ivf.rs index ff6b03ef051..579990fc03b 100644 --- a/rust/lance/src/index/vector/ivf.rs +++ b/rust/lance/src/index/vector/ivf.rs @@ -4870,6 +4870,7 @@ mod tests { use_index: true, query_parallelism: lance_index::vector::DEFAULT_QUERY_PARALLELISM, dist_q_c: 0.0, + approx_mode: Default::default(), }; let (partitions, _) = index.find_partitions(&query).unwrap(); let nearest_partition_id = partitions.value(0) as usize; diff --git a/rust/lance/src/index/vector/ivf/v2.rs b/rust/lance/src/index/vector/ivf/v2.rs index 4315bbb0598..4ea076ed420 100644 --- a/rust/lance/src/index/vector/ivf/v2.rs +++ b/rust/lance/src/index/vector/ivf/v2.rs @@ -177,6 +177,23 @@ fn rabit_u8_scratch_len(dim: usize, num_bits: u8) -> usize { binary_dist_table_len.max(ex_dist_table_len) } +fn rabit_query_scratch_capacity( + dim: usize, + max_partition_len: usize, + num_bits: u8, +) -> QueryScratchCapacity { + let dist_table_len = dim * 4; + let ex_dist_table_len = rabit_ex_dist_table_len(dim, num_bits); + let u8_scratch_len = rabit_u8_scratch_len(dim, num_bits); + + QueryScratchCapacity::new( + max_partition_len, + dim + dist_table_len + ex_dist_table_len, + max_partition_len.max(dist_table_len), + u8_scratch_len, + ) +} + impl DeepSizeOf for IvfIndexState { fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { self.index_file_path.deep_size_of_children(context) @@ -934,25 +951,13 @@ impl IVFIndex { } let dim = ivf.dimension(); - let dist_table_len = dim * 4; - let (ex_dist_table_len, u8_scratch_len) = match storage.quantizer() { - Ok(Quantizer::Rabit(rq)) => { - let num_bits = rq.metadata_ref().num_bits; - ( - 
rabit_ex_dist_table_len(dim, num_bits), - rabit_u8_scratch_len(dim, num_bits), - ) - } - _ => (dim * 256, dim * 32), - }; let max_partition_len = ivf.lengths.iter().copied().max().unwrap_or_default() as usize; + let num_bits = match storage.quantizer() { + Ok(Quantizer::Rabit(rq)) => rq.metadata_ref().num_bits, + _ => 9, + }; - QueryScratchCapacity::new( - max_partition_len, - dim + dist_table_len + ex_dist_table_len, - max_partition_len, - u8_scratch_len, - ) + rabit_query_scratch_capacity(dim, max_partition_len, num_bits) } fn use_residual_scratch(ivf: &IvfModel, use_query_residual: bool) -> bool { @@ -1999,6 +2004,20 @@ mod tests { assert_eq!(super::rabit_u8_scratch_len(dim, 9), dim * 32); } + #[test] + fn test_rabit_query_scratch_capacity_does_not_preallocate_u32() { + let dim = 960; + let max_partition_len = 4096; + + let capacity = super::rabit_query_scratch_capacity(dim, max_partition_len, 5); + + assert_eq!(capacity.distances, max_partition_len); + assert_eq!(capacity.query_f32, dim + dim * 4 + dim * 16); + assert_eq!(capacity.u16, max_partition_len); + assert_eq!(capacity.u8, dim * 16); + assert_eq!(capacity.u32, 0); + } + async fn generate_test_dataset( test_uri: &str, range: Range, diff --git a/rust/lance/src/index/vector/pq.rs b/rust/lance/src/index/vector/pq.rs index 1c69cfcd223..a661a314b4d 100644 --- a/rust/lance/src/index/vector/pq.rs +++ b/rust/lance/src/index/vector/pq.rs @@ -911,6 +911,7 @@ mod tests { use_index: true, query_parallelism: DEFAULT_QUERY_PARALLELISM, dist_q_c: 0.0, + approx_mode: Default::default(), }; let is_empty_threads = Arc::new(Mutex::new(Vec::new())); let pre_filter = Arc::new(TestPreFilter::with_thread_capture( diff --git a/rust/lance/src/io/exec/ann_proto.rs b/rust/lance/src/io/exec/ann_proto.rs index 18677187cc4..c57ad4ca1b7 100644 --- a/rust/lance/src/io/exec/ann_proto.rs +++ b/rust/lance/src/io/exec/ann_proto.rs @@ -16,7 +16,7 @@ use arrow_array::RecordBatch; use arrow_schema::{Field, Schema as ArrowSchema}; use 
lance_core::{Error, Result}; use lance_index::pb as index_pb; -use lance_index::vector::{DEFAULT_QUERY_PARALLELISM, Query}; +use lance_index::vector::{ApproxMode, DEFAULT_QUERY_PARALLELISM, Query}; use lance_linalg::distance::DistanceType; use lance_table::format::IndexMetadata; use lance_table::format::pb as table_pb; @@ -80,6 +80,22 @@ fn query_vector_from_ipc_bytes(bytes: &[u8]) -> Result { Ok(batches[0].column(0).clone()) } +fn approx_mode_to_proto(mode: ApproxMode) -> pb::VectorApproxMode { + match mode { + ApproxMode::Fast => pb::VectorApproxMode::Fast, + ApproxMode::Normal => pb::VectorApproxMode::Normal, + ApproxMode::Accurate => pb::VectorApproxMode::Accurate, + } +} + +fn approx_mode_from_proto(value: i32) -> ApproxMode { + match pb::VectorApproxMode::try_from(value).unwrap_or(pb::VectorApproxMode::Normal) { + pb::VectorApproxMode::Fast => ApproxMode::Fast, + pb::VectorApproxMode::Normal => ApproxMode::Normal, + pb::VectorApproxMode::Accurate => ApproxMode::Accurate, + } +} + pub fn query_to_proto(query: &Query) -> Result { let query_vector_arrow_ipc = query_vector_to_ipc_bytes(query.key.as_ref())?; @@ -101,6 +117,7 @@ pub fn query_to_proto(query: &Query) -> Result { use_index: query.use_index, dist_q_c: Some(query.dist_q_c), query_parallelism: Some(query.query_parallelism), + approx_mode: approx_mode_to_proto(query.approx_mode) as i32, }) } @@ -130,6 +147,7 @@ pub fn query_from_proto(proto: pb::VectorQueryProto) -> Result { use_index: proto.use_index, query_parallelism: proto.query_parallelism.unwrap_or(DEFAULT_QUERY_PARALLELISM), dist_q_c: proto.dist_q_c.unwrap_or(0.0), + approx_mode: approx_mode_from_proto(proto.approx_mode), }) } @@ -335,6 +353,7 @@ mod tests { use_index: true, query_parallelism: -1, dist_q_c: 0.42, + approx_mode: ApproxMode::Accurate, }; let proto = query_to_proto(&query).unwrap(); @@ -352,6 +371,7 @@ mod tests { assert_eq!(query.use_index, back.use_index); assert_eq!(query.query_parallelism, back.query_parallelism); 
assert_eq!(query.dist_q_c, back.dist_q_c); + assert_eq!(query.approx_mode, back.approx_mode); assert_eq!(query.key.len(), back.key.len()); assert_eq!(query.key.data_type(), back.key.data_type()); } @@ -373,12 +393,19 @@ mod tests { use_index: false, query_parallelism: DEFAULT_QUERY_PARALLELISM, dist_q_c: 0.0, + approx_mode: ApproxMode::Normal, }; let proto = query_to_proto(&query).unwrap(); let back = query_from_proto(proto).unwrap(); assert!(back.metric_type.is_none()); assert!(!back.use_index); + assert_eq!(back.approx_mode, ApproxMode::Normal); + + let mut proto = query_to_proto(&query).unwrap(); + proto.approx_mode = i32::MAX; + let back = query_from_proto(proto).unwrap(); + assert_eq!(back.approx_mode, ApproxMode::Normal); } async fn make_vector_dataset() -> (Arc, tempfile::TempDir) { @@ -444,6 +471,7 @@ mod tests { use_index: true, query_parallelism: DEFAULT_QUERY_PARALLELISM, dist_q_c: 0.0, + approx_mode: ApproxMode::Normal, }; let exec = ANNIvfPartitionExec::try_new( @@ -491,6 +519,7 @@ mod tests { use_index: true, query_parallelism: DEFAULT_QUERY_PARALLELISM, dist_q_c: 0.0, + approx_mode: ApproxMode::Normal, }; // Use a TestMemoryExec as a mock input child (provides the KNN_PARTITION_SCHEMA) @@ -554,6 +583,7 @@ mod tests { use_index: true, query_parallelism: DEFAULT_QUERY_PARALLELISM, dist_q_c: 0.0, + approx_mode: ApproxMode::Normal, }; let input: Arc = TestMemoryExec::try_new_exec( diff --git a/rust/lance/src/io/exec/knn.rs b/rust/lance/src/io/exec/knn.rs index 48bd9fb4196..c4c79dcee5e 100644 --- a/rust/lance/src/io/exec/knn.rs +++ b/rust/lance/src/io/exec/knn.rs @@ -1934,6 +1934,7 @@ mod tests { use_index: true, query_parallelism: DEFAULT_QUERY_PARALLELISM, dist_q_c: 0.0, + approx_mode: Default::default(), } } @@ -2693,6 +2694,7 @@ mod tests { use_index: true, query_parallelism: DEFAULT_QUERY_PARALLELISM, dist_q_c: 0.0, + approx_mode: Default::default(), }; async fn multivector_scoring( From 66ea533d4d87567afabde1dd37cab670e4e2ac3b Mon Sep 17 00:00:00 
2001 From: Brendan Clement Date: Wed, 10 Jun 2026 09:57:17 -0700 Subject: [PATCH 074/177] feat: branch-aware table version ops in directory and rest namespaces (#7166) ### Description Branch-aware table version operations. Table version operations in DirectoryNamespace and RestNamespace now operate on a branch's own version chain when a branch is supplied. Branches are already a first-class ref in the Lance table format; this brings the namespace operations in line. An absent branch (or "main") behaves exactly as before. Branch lifecycle operations. Adds CreateTableBranch, ListTableBranches, and DeleteTableBranch to DirectoryNamespace and RestNamespace, exposed by the rest_adapter at POST /v1/table/{id}/branches/{create,list,delete}. These wrap Lance's first-class Dataset branch APIs and mirror the existing tag operations, so the three layers stay uniform. Managed versioning now supports branches end to end. Previously the namespace-managed commit store (LanceNamespaceExternalManifestStore) was bound to one branch at construction and ignored the base path handed to it, and a branch-opened managed dataset stayed rooted at the table root. The store now derives the branch a request targets from the base path it is handed, resolved against the table root. Drive-by fix. batch_delete_table_versions constructed {version}.manifest to delete, which is V1-only naming and silently matched nothing under V2 (the default scheme). It now matches actual manifests via Lance's ManifestNamingScheme. ### Testing - Branch round-trips for all version operations in DirectoryNamespace. - Managed branch e2e (DirectoryNamespace and rest_adapter loopback): with_branch open and commit, data files under tree/<branch>/data/, unmanaged interop read, cross-branch checkout at overlapping version numbers, branch tags via with_tag and Ref::Tag, non-latest fork; asserts managed_versioning survives the REST round trip.
--- java/lance-jni/src/blocking_dataset.rs | 10 +- java/lance-jni/src/transaction.rs | 21 +- python/src/dataset.rs | 49 +- python/src/fragment.rs | 12 +- rust/lance-namespace-impls/src/dir.rs | 2507 ++++++++++++++++- rust/lance-namespace-impls/src/rest.rs | 71 +- .../lance-namespace-impls/src/rest_adapter.rs | 541 +++- rust/lance-namespace/src/error.rs | 26 +- rust/lance-namespace/src/namespace.rs | 53 +- rust/lance/src/dataset.rs | 114 +- rust/lance/src/dataset/branch_location.rs | 61 + rust/lance/src/dataset/builder.rs | 149 +- .../src/dataset/tests/dataset_versioning.rs | 139 + rust/lance/src/dataset/write/commit.rs | 8 +- .../lance/src/io/commit/namespace_manifest.rs | 44 +- 15 files changed, 3608 insertions(+), 197 deletions(-) diff --git a/java/lance-jni/src/blocking_dataset.rs b/java/lance-jni/src/blocking_dataset.rs index 5be47dd2c2f..1d06f3eed87 100644 --- a/java/lance-jni/src/blocking_dataset.rs +++ b/java/lance-jni/src/blocking_dataset.rs @@ -201,7 +201,8 @@ impl BlockingDataset { if namespace_client_managed_versioning && let (Some(namespace_client), Some(tid)) = (namespace, table_id) { - let external_store = LanceNamespaceExternalManifestStore::new(namespace_client, tid); + let external_store = + LanceNamespaceExternalManifestStore::for_table_uri(namespace_client, tid, uri)?; let commit_handler: Arc = Arc::new(ExternalManifestCommitHandler { external_manifest_store: Arc::new(external_store), }); @@ -689,8 +690,11 @@ fn create_dataset<'local>( if let Some((namespace, table_id)) = namespace_info { // Set up commit handler only if namespace manages versioning if namespace_client_managed_versioning { - let external_store = - LanceNamespaceExternalManifestStore::new(namespace.clone(), table_id.clone()); + let external_store = LanceNamespaceExternalManifestStore::for_table_uri( + namespace.clone(), + table_id.clone(), + &path_str, + )?; let commit_handler: Arc = Arc::new(ExternalManifestCommitHandler { external_manifest_store: Arc::new(external_store), }); 
diff --git a/java/lance-jni/src/transaction.rs b/java/lance-jni/src/transaction.rs index 6bc1948ae6a..4f899f56ff2 100644 --- a/java/lance-jni/src/transaction.rs +++ b/java/lance-jni/src/transaction.rs @@ -774,12 +774,18 @@ fn inner_commit_to_dataset<'local>( // Set namespace commit handler only if namespace_client_managed_versioning is true let namespace_info = extract_namespace_info(env, &namespace_obj, &table_id_obj)?; let commit_handler = if namespace_client_managed_versioning { - namespace_info.map(|(ns, tid)| { - let external_store = LanceNamespaceExternalManifestStore::new(ns, tid); - Arc::new(ExternalManifestCommitHandler { - external_manifest_store: Arc::new(external_store), - }) as Arc - }) + match namespace_info { + Some((ns, tid)) => { + // The store derives the branch a request targets from the base + // path it is handed, resolved against the table root. + let table_root = java_blocking_ds.inner.branch_location().find_main()?.path; + let external_store = LanceNamespaceExternalManifestStore::new(ns, tid, table_root); + Some(Arc::new(ExternalManifestCommitHandler { + external_manifest_store: Arc::new(external_store), + }) as Arc) + } + None => None, + } } else { None }; @@ -1560,7 +1566,8 @@ fn inner_commit_to_uri<'local>( // Set namespace commit handler only if namespace_client_managed_versioning is true if namespace_client_managed_versioning && let Some((namespace_client, tid)) = namespace_info { - let external_store = LanceNamespaceExternalManifestStore::new(namespace_client, tid); + let external_store = + LanceNamespaceExternalManifestStore::for_table_uri(namespace_client, tid, &uri_str)?; let commit_handler: Arc = Arc::new(ExternalManifestCommitHandler { external_manifest_store: Arc::new(external_store), }); diff --git a/python/src/dataset.rs b/python/src/dataset.rs index 25e73a9412e..8bfa81aeae4 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -818,8 +818,14 @@ impl Dataset { // Set up commit handler only if namespace manages 
versioning if namespace_client_managed_versioning { - let external_store = - LanceNamespaceExternalManifestStore::new(ns_client, tid.clone()); + // The store derives the branch a request targets from the base + // path it is handed, resolved against the table root. + let external_store = LanceNamespaceExternalManifestStore::for_table_uri( + ns_client, + tid.clone(), + &uri, + ) + .infer_error()?; let commit_handler: Arc = Arc::new(ExternalManifestCommitHandler { external_manifest_store: Arc::new(external_store), @@ -2655,9 +2661,16 @@ impl Dataset { && let (Some(ns_client), Some(tid)) = (namespace_client, table_id) { // Create ExternalManifestCommitHandler from namespace client and table_id - // only when namespace manages versioning + // only when namespace manages versioning. The store derives the + // branch a request targets from the base path it is handed, + // resolved against the table root. let ns_client = extract_namespace_arc(ns_client.py(), ns_client)?; - let external_store = LanceNamespaceExternalManifestStore::new(ns_client, tid); + let external_store = LanceNamespaceExternalManifestStore::for_table_uri( + ns_client, + tid, + &dest.table_root_uri()?, + ) + .infer_error()?; Some(Arc::new(ExternalManifestCommitHandler { external_manifest_store: Arc::new(external_store), }) as Arc) @@ -3614,6 +3627,15 @@ impl PyWriteDest { Self::Uri(uri) => WriteDestination::Uri(uri), } } + + /// The table root uri of this destination (a branch dataset resolves to + /// its main location). Used to root the namespace manifest store. 
+ pub fn table_root_uri(&self) -> PyResult { + match self { + Self::Dataset(ds) => Ok(ds.ds.branch_location().find_main().infer_error()?.uri), + Self::Uri(uri) => Ok(uri.to_string()), + } + } } #[derive(Debug, Clone, Copy, PartialEq)] @@ -3969,7 +3991,7 @@ pub fn write_dataset( dest: PyWriteDest, options: &Bound<'_, PyDict>, ) -> PyResult { - let params = get_write_params(options)?; + let params = get_write_params(options, &dest.table_root_uri()?)?; let py = options.py(); let ds = if reader.is_instance_of::() { let scanner: Scanner = reader.extract()?; @@ -4038,8 +4060,13 @@ fn get_dict_opt<'py, D: FromPyObjectOwned<'py>>( .transpose() } +/// `table_uri` is the destination table's root uri; it roots the namespace +/// manifest store when `namespace_client_managed_versioning` is requested. #[allow(deprecated)] -pub fn get_write_params(options: &Bound<'_, PyDict>) -> PyResult> { +pub fn get_write_params( + options: &Bound<'_, PyDict>, + table_uri: &str, +) -> PyResult> { let params = if options.is_none() { None } else { @@ -4209,9 +4236,15 @@ pub fn get_write_params(options: &Bound<'_, PyDict>) -> PyResult = Arc::new(ExternalManifestCommitHandler { external_manifest_store: Arc::new(external_store), }); diff --git a/python/src/fragment.rs b/python/src/fragment.rs index 1da99492fac..e6060b1ac4e 100644 --- a/python/src/fragment.rs +++ b/python/src/fragment.rs @@ -23,7 +23,7 @@ use lance::Error; use lance::dataset::fragment::FileFragment as LanceFragment; use lance::dataset::scanner::ColumnOrdering; use lance::dataset::transaction::{Operation, Transaction}; -use lance::dataset::{InsertBuilder, NewColumnTransform}; +use lance::dataset::{InsertBuilder, NewColumnTransform, WriteParams}; use lance_core::datatypes::BlobHandling; use lance_io::utils::CachedFileSize; use lance_table::format::{ @@ -119,7 +119,7 @@ impl FileFragment { kwargs: Option<&Bound<'_, PyDict>>, ) -> PyResult> { let params = if let Some(kw_params) = kwargs { - get_write_params(kw_params)? 
+ get_write_params(kw_params, dataset_uri)? } else { None }; @@ -435,10 +435,10 @@ fn do_write_fragments( ) -> PyResult { let batches = convert_reader(reader)?; - let params = kwargs - .and_then(|params| get_write_params(params).transpose()) - .transpose()? - .unwrap_or_default(); + let params = match kwargs { + Some(params) => get_write_params(params, &dest.table_root_uri()?)?.unwrap_or_default(), + None => WriteParams::default(), + }; rt().block_on( Some(reader.py()), diff --git a/rust/lance-namespace-impls/src/dir.rs b/rust/lance-namespace-impls/src/dir.rs index a8eb1416183..8637b6d61f2 100644 --- a/rust/lance-namespace-impls/src/dir.rs +++ b/rust/lance-namespace-impls/src/dir.rs @@ -15,6 +15,7 @@ use async_trait::async_trait; use bytes::Bytes; use futures::{StreamExt, TryStreamExt}; use lance::dataset::builder::DatasetBuilder; +use lance::dataset::refs::check_valid_branch; use lance::dataset::scanner::Scanner; use lance::dataset::statistics::DatasetStatisticsExt; use lance::dataset::transaction::{Operation, Transaction}; @@ -45,25 +46,27 @@ use std::sync::{Arc, Mutex}; use crate::context::DynamicContextProvider; use lance_namespace::models::{ AnalyzeTableQueryPlanRequest, BatchDeleteTableVersionsRequest, - BatchDeleteTableVersionsResponse, CountTableRowsRequest, CreateNamespaceRequest, - CreateNamespaceResponse, CreateTableIndexRequest, CreateTableIndexResponse, CreateTableRequest, - CreateTableResponse, CreateTableScalarIndexResponse, CreateTableTagRequest, + BatchDeleteTableVersionsResponse, BranchContents as ModelBranchContents, CountTableRowsRequest, + CreateNamespaceRequest, CreateNamespaceResponse, CreateTableBranchRequest, + CreateTableBranchResponse, CreateTableIndexRequest, CreateTableIndexResponse, + CreateTableRequest, CreateTableResponse, CreateTableScalarIndexResponse, CreateTableTagRequest, CreateTableTagResponse, CreateTableVersionRequest, CreateTableVersionResponse, - DeclareTableRequest, DeclareTableResponse, DeleteTableTagRequest, 
DeleteTableTagResponse, - DescribeNamespaceRequest, DescribeNamespaceResponse, DescribeTableIndexStatsRequest, - DescribeTableIndexStatsResponse, DescribeTableRequest, DescribeTableResponse, - DescribeTableVersionRequest, DescribeTableVersionResponse, DescribeTransactionRequest, - DescribeTransactionResponse, DropNamespaceRequest, DropNamespaceResponse, - DropTableIndexRequest, DropTableIndexResponse, DropTableRequest, DropTableResponse, - ExplainTableQueryPlanRequest, FragmentStats, FragmentSummary, GetTableStatsRequest, - GetTableStatsResponse, GetTableTagVersionRequest, GetTableTagVersionResponse, Identity, - IndexContent, InsertIntoTableRequest, InsertIntoTableResponse, ListNamespacesRequest, - ListNamespacesResponse, ListTableIndicesRequest, ListTableIndicesResponse, - ListTableTagsRequest, ListTableTagsResponse, ListTableVersionsRequest, - ListTableVersionsResponse, ListTablesRequest, ListTablesResponse, MergeInsertIntoTableRequest, - MergeInsertIntoTableResponse, NamespaceExistsRequest, QueryTableRequest, - QueryTableRequestColumns, QueryTableRequestVector, RestoreTableRequest, RestoreTableResponse, - TableExistsRequest, TableVersion, TagContents as ModelTagContents, + DeclareTableRequest, DeclareTableResponse, DeleteTableBranchRequest, DeleteTableBranchResponse, + DeleteTableTagRequest, DeleteTableTagResponse, DescribeNamespaceRequest, + DescribeNamespaceResponse, DescribeTableIndexStatsRequest, DescribeTableIndexStatsResponse, + DescribeTableRequest, DescribeTableResponse, DescribeTableVersionRequest, + DescribeTableVersionResponse, DescribeTransactionRequest, DescribeTransactionResponse, + DropNamespaceRequest, DropNamespaceResponse, DropTableIndexRequest, DropTableIndexResponse, + DropTableRequest, DropTableResponse, ExplainTableQueryPlanRequest, FragmentStats, + FragmentSummary, GetTableStatsRequest, GetTableStatsResponse, GetTableTagVersionRequest, + GetTableTagVersionResponse, Identity, IndexContent, InsertIntoTableRequest, + InsertIntoTableResponse, 
ListNamespacesRequest, ListNamespacesResponse, + ListTableBranchesRequest, ListTableBranchesResponse, ListTableIndicesRequest, + ListTableIndicesResponse, ListTableTagsRequest, ListTableTagsResponse, + ListTableVersionsRequest, ListTableVersionsResponse, ListTablesRequest, ListTablesResponse, + MergeInsertIntoTableRequest, MergeInsertIntoTableResponse, NamespaceExistsRequest, + QueryTableRequest, QueryTableRequestColumns, QueryTableRequestVector, RestoreTableRequest, + RestoreTableResponse, TableExistsRequest, TableVersion, TagContents as ModelTagContents, UpdateTableSchemaMetadataRequest, UpdateTableSchemaMetadataResponse, UpdateTableTagRequest, UpdateTableTagResponse, }; @@ -1062,6 +1065,44 @@ impl DirectoryNamespace { } } + /// Map lance-core ref errors from branch operations to namespace errors. + /// + /// `RefConflict` is intentionally not handled here: create-time duplicates are rejected by + /// the existence pre-check before `create_branch` runs, and delete maps its own `RefConflict` + /// (branch still has dependents) inline. + fn map_branch_error( + err: lance_core::Error, + branch: &str, + table_uri: &str, + ) -> lance_core::Error { + match err { + lance_core::Error::RefNotFound { .. 
} => NamespaceError::TableBranchNotFound { + message: format!("branch '{}' for table at '{}'", branch, table_uri), + } + .into(), + lance_core::Error::InvalidRef { message } => NamespaceError::InvalidInput { + message: format!("invalid branch '{}': {}", branch, message), + } + .into(), + lance_core::Error::VersionNotFound { message } => { + NamespaceError::TableVersionNotFound { + message: format!( + "source version for branch '{}' not found for table at '{}': {}", + branch, table_uri, message + ), + } + .into() + } + other => NamespaceError::Internal { + message: format!( + "branch operation failed for branch '{}' on table at '{}': {}", + branch, table_uri, other + ), + } + .into(), + } + } + async fn table_has_actual_manifests(&self, table_name: &str) -> Result { manifest::ManifestNamespace::path_has_actual_manifests( &self.object_store, @@ -1160,6 +1201,54 @@ impl DirectoryNamespace { ObjectStore::extract_path_from_uri(registry, uri) } + /// Normalize and validate a branch selector: `None`, empty, and `main` mean + /// the main branch; any other name is validated with lance's + /// `check_valid_branch` (lance skips this on the open path) so it cannot + /// escape the table root via `..`. 
+ fn normalized_branch(branch: Option<&str>) -> Result> { + match branch.filter(|b| !b.is_empty() && *b != "main") { + Some(branch) => { + check_valid_branch(branch).map_err(|e| { + lance_core::Error::from(NamespaceError::InvalidInput { + message: format!("invalid branch name '{}': {}", branch, e), + }) + })?; + Ok(Some(branch)) + } + None => Ok(None), + } + } + + async fn open_validated_branch(&self, table_uri: &str, branch: &str) -> Result { + let dataset = self + .configured_builder(table_uri) + .with_branch(branch, None) + .load() + .await + .map_err(|e| { + lance_core::Error::from(NamespaceError::TableNotFound { + message: format!( + "branch '{}' not found for table at '{}': {}", + branch, table_uri, e + ), + }) + })?; + dataset.branches().get(branch).await.map_err(|_| { + lance_core::Error::from(NamespaceError::TableNotFound { + message: format!("branch '{}' not found for table at '{}'", branch, table_uri), + }) + })?; + Ok(dataset) + } + + async fn resolve_branch_location(&self, table_uri: &str, branch: &str) -> Result { + Ok(self + .open_validated_branch(table_uri, branch) + .await? + .branch_location() + .uri) + } + fn validate_dir_only_properties( properties: Option<&HashMap>, operation: &str, @@ -1217,6 +1306,13 @@ impl DirectoryNamespace { Ok(dataset) } + /// Logical table version parsed from a manifest filename, or `None` for + /// non-manifest / detached entries. Delegates to lance's scheme detection so + /// version listing and deletion stay consistent with the on-disk format. 
+ fn manifest_version_from_filename(filename: &str) -> Option { + ManifestNamingScheme::detect_scheme(filename)?.parse_version(filename) + } + async fn list_table_versions_from_storage( &self, table_uri: &str, @@ -1247,17 +1343,7 @@ impl DirectoryNamespace { .into_iter() .filter_map(|meta| { let filename = meta.location.filename()?; - let version_str = filename.strip_suffix(".manifest")?; - if version_str.starts_with('d') { - return None; - } - let file_version: u64 = version_str.parse().ok()?; - - let actual_version = if file_version > u64::MAX / 2 { - u64::MAX - file_version - } else { - file_version - }; + let actual_version = Self::manifest_version_from_filename(filename)?; Some(TableVersion { version: actual_version as i64, @@ -1525,6 +1611,19 @@ impl DirectoryNamespace { } } + /// Build a `DatasetBuilder` for `table_uri` with this namespace's storage + /// options and session applied. Callers add version/branch scoping. + fn configured_builder(&self, table_uri: &str) -> DatasetBuilder { + let mut builder = DatasetBuilder::from_uri(table_uri); + if let Some(opts) = &self.storage_options { + builder = builder.with_storage_options(opts.clone()); + } + if let Some(sess) = &self.session { + builder = builder.with_session(sess.clone()); + } + builder + } + async fn load_dataset( &self, table_uri: &str, @@ -1543,13 +1642,7 @@ impl DirectoryNamespace { .into()); } - let mut builder = DatasetBuilder::from_uri(table_uri); - if let Some(opts) = &self.storage_options { - builder = builder.with_storage_options(opts.clone()); - } - if let Some(sess) = &self.session { - builder = builder.with_session(sess.clone()); - } + let builder = self.configured_builder(table_uri); let dataset = builder.load().await.map_err(|e| { lance_core::Error::from(NamespaceError::TableNotFound { @@ -2130,40 +2223,66 @@ impl DirectoryNamespace { &self, table_entries: &[TableDeleteEntry], best_effort: bool, + branch: Option<&str>, ) -> Result { let mut deleted_count = 0i64; for te in table_entries 
{ let table_uri = self.resolve_table_location(&te.table_id).await?; + let table_uri = match branch { + Some(b) => self.resolve_branch_location(&table_uri, b).await?, + None => table_uri, + }; let table_path = self.object_store_path_from_uri(&table_uri)?; let versions_dir_path = table_path.clone().join(VERSIONS_DIR); - for (start, end) in &te.ranges { - for version in *start..=*end { - let version_path = versions_dir_path - .clone() - .join(format!("{}.manifest", version as u64)); - match self.object_store.inner.delete(&version_path).await { - Ok(_) => { - deleted_count += 1; - } - Err(object_store::Error::NotFound { .. }) => {} - Err(e) => { - if best_effort { - log::warn!( - "Failed to delete manifest file for version {} of table {:?}: {:?}", - version, - te.table_id, - e - ); - } else { - return Err(NamespaceError::Internal { - message: format!( - "Failed to delete version {} for table at '{}': {}", - version, table_uri, e - ), - } - .into()); + // Match listed files, not constructed names (`{version}.manifest` misses V2). + let manifest_metas: Vec<_> = self + .object_store + .read_dir_all(&versions_dir_path, None) + .try_collect() + .await + .map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!( + "Failed to list manifest files for table at '{}': {}", + table_uri, e + ), + }) + })?; + let location_by_version: HashMap = manifest_metas + .into_iter() + .filter_map(|meta| { + let version = Self::manifest_version_from_filename(meta.location.filename()?)?; + Some((version, meta.location)) + }) + .collect(); + + for (&v, version_path) in &location_by_version { + let vi = v as i64; + if !te.ranges.iter().any(|&(s, e)| vi >= s && (e < 0 || vi < e)) { + continue; + } + match self.object_store.inner.delete(version_path).await { + Ok(_) => { + deleted_count += 1; + } + Err(object_store::Error::NotFound { .. 
}) => {} + Err(e) => { + if best_effort { + log::warn!( + "Failed to delete manifest file for version {} of table {:?}: {:?}", + v, + te.table_id, + e + ); + } else { + return Err(NamespaceError::Internal { + message: format!( + "Failed to delete version {} for table at '{}': {}", + v, table_uri, e + ), } + .into()); } } } @@ -2807,8 +2926,11 @@ impl LanceNamespace for DirectoryNamespace { request: ListTableVersionsRequest, ) -> Result { self.record_op("list_table_versions"); - // When table_version_storage_enabled, query from __manifest - if self.table_version_storage_enabled + let branch = Self::normalized_branch(request.branch.as_deref())?; + // The manifest catalog has no branch concept, so a branch lists its own + // version chain from storage under its tree path instead. + if branch.is_none() + && self.table_version_storage_enabled && let Some(ref manifest_ns) = self.manifest_ns { let table_id = request.id.clone().unwrap_or_default(); @@ -2820,6 +2942,10 @@ impl LanceNamespace for DirectoryNamespace { // Fallback when table_version_storage is not enabled: list from _versions/ directory let table_uri = self.resolve_table_location(&request.id).await?; + let table_uri = match branch { + Some(b) => self.resolve_branch_location(&table_uri, b).await?, + None => table_uri, + }; let want_descending = request.descending == Some(true); let table_versions = self .list_table_versions_from_storage(&table_uri, want_descending, request.limit) @@ -2836,7 +2962,12 @@ impl LanceNamespace for DirectoryNamespace { request: CreateTableVersionRequest, ) -> Result { self.record_op("create_table_version"); + let branch = Self::normalized_branch(request.branch.as_deref())?; let table_uri = self.resolve_table_location(&request.id).await?; + let table_uri = match branch { + Some(b) => self.resolve_branch_location(&table_uri, b).await?, + None => table_uri, + }; let staging_manifest_path = &request.manifest_path; let version = request.version as u64; @@ -2956,8 +3087,10 @@ impl 
LanceNamespace for DirectoryNamespace { ); } - // If table_version_storage_enabled is enabled, also record in __manifest (best-effort) - if self.table_version_storage_enabled + // Also record in __manifest (best-effort). Branches aren't tracked there, + // so for a branch the storage manifest above is the only record. + if branch.is_none() + && self.table_version_storage_enabled && let Some(ref manifest_ns) = self.manifest_ns { let table_id_str = @@ -3009,9 +3142,12 @@ impl LanceNamespace for DirectoryNamespace { request: DescribeTableVersionRequest, ) -> Result { self.record_op("describe_table_version"); + let branch = Self::normalized_branch(request.branch.as_deref())?; // When table_version_storage_enabled and a specific version is requested, - // query from __manifest to avoid opening the entire dataset - if self.table_version_storage_enabled + // query from __manifest to avoid opening the entire dataset. A branch has + // no manifest-catalog entry, so it resolves from storage instead. + if branch.is_none() + && self.table_version_storage_enabled && let (Some(manifest_ns), Some(version)) = (&self.manifest_ns, request.version) { let table_id = request.id.clone().unwrap_or_default(); @@ -3020,6 +3156,10 @@ impl LanceNamespace for DirectoryNamespace { // Fallback when table_version_storage is not enabled: inspect physical manifests directly. let table_uri = self.resolve_table_location(&request.id).await?; + let table_uri = match branch { + Some(b) => self.resolve_branch_location(&table_uri, b).await?, + None => table_uri, + }; let versions = self .list_table_versions_from_storage(&table_uri, true, None) .await?; @@ -3057,21 +3197,39 @@ impl LanceNamespace for DirectoryNamespace { request: BatchDeleteTableVersionsRequest, ) -> Result { self.record_op("batch_delete_table_versions"); + let branch = Self::normalized_branch(request.branch.as_deref())?; // Single-table mode: use `id` (from path parameter) + `ranges` to delete // versions from one table. 
let ranges: Vec<(i64, i64)> = request .ranges .iter() - .map(|r| { - let start = r.start_version; - let end = if r.end_version > 0 { - r.end_version + .map(|r| (r.start_version, r.end_version)) + .collect(); + + // Reject pathological bounded ranges up front: the manifest path below + // builds one id per version, so (0, i64::MAX) would exhaust memory. A + // through-latest range (end < 0) is bounded by the manifests that exist. + const MAX_VERSIONS_PER_REQUEST: i128 = 1_000_000; + let requested: i128 = ranges + .iter() + .map(|(s, e)| { + if *e < 0 { + 0 } else { - start - }; - (start, end) + (*e as i128 - *s as i128).max(0) + } }) - .collect(); + .sum(); + if requested > MAX_VERSIONS_PER_REQUEST { + return Err(NamespaceError::InvalidInput { + message: format!( + "batch_delete requested {} versions; limit is {}", + requested, MAX_VERSIONS_PER_REQUEST + ), + } + .into()); + } + let table_entries = vec![TableDeleteEntry { table_id: request.id.clone(), ranges, @@ -3079,9 +3237,28 @@ impl LanceNamespace for DirectoryNamespace { let mut total_deleted_count = 0i64; - if self.table_version_storage_enabled + // Branches are not tracked in the manifest catalog, so a branch skips the + // __manifest phase entirely and deletes its physical manifests directly. + if branch.is_none() + && self.table_version_storage_enabled && let Some(ref manifest_ns) = self.manifest_ns { + // Through-latest ranges (end_version < 0) would require enumerating the + // __manifest chain up to the latest version, which is not wired up here. + // Reject rather than silently delete physical files while leaving the + // __manifest records in place. 
+ if table_entries + .iter() + .any(|te| te.ranges.iter().any(|&(_, e)| e < 0)) + { + return Err(NamespaceError::Unsupported { + message: "through-latest delete (end_version < 0) is not supported \ + for managed-versioning tables" + .to_string(), + } + .into()); + } + // Phase 1 (atomic commit point): Delete version records from __manifest // for ALL tables in a single atomic operation. This is the authoritative // source of truth — once __manifest entries are removed, the versions @@ -3094,7 +3271,7 @@ impl LanceNamespace for DirectoryNamespace { &te.table_id.clone().unwrap_or_default(), ); for (start, end) in &te.ranges { - for version in *start..=*end { + for version in *start..*end { let object_id = manifest::ManifestNamespace::build_version_object_id( &table_id_str, version, @@ -3115,7 +3292,7 @@ impl LanceNamespace for DirectoryNamespace { // __manifest, so they won't be visible to readers. Leftover files are // orphaned but harmless and can be cleaned up later. let _ = self - .delete_physical_version_files(&table_entries, true) + .delete_physical_version_files(&table_entries, true, branch) .await; return Ok(BatchDeleteTableVersionsResponse { @@ -3124,9 +3301,10 @@ impl LanceNamespace for DirectoryNamespace { }); } - // Fallback when table_version_storage is not enabled: delete physical files directly (no __manifest) + // Direct path: delete physical files (no __manifest). Reached when storage + // tracking is off, or for any branch (which has no __manifest entries). 
total_deleted_count = self - .delete_physical_version_files(&table_entries, false) + .delete_physical_version_files(&table_entries, false, branch) .await?; Ok(BatchDeleteTableVersionsResponse { @@ -3462,8 +3640,12 @@ impl LanceNamespace for DirectoryNamespace { )); } + let branch = Self::normalized_branch(request.branch.as_deref())?; let table_uri = self.resolve_table_location(&request.id).await?; - let mut dataset = self.load_dataset(&table_uri, None, "restore_table").await?; + let mut dataset = match branch { + Some(branch) => self.open_validated_branch(&table_uri, branch).await?, + None => self.load_dataset(&table_uri, None, "restore_table").await?, + }; dataset = dataset .checkout_version(version as u64) @@ -4273,6 +4455,156 @@ impl LanceNamespace for DirectoryNamespace { }) } + async fn create_table_branch( + &self, + request: CreateTableBranchRequest, + ) -> Result { + self.record_op("create_table_branch"); + if request.name.is_empty() { + return Err(NamespaceError::InvalidInput { + message: "branch name must not be empty for create_table_branch".to_string(), + } + .into()); + } + let from_version = match request.from_version { + Some(v) if v <= 0 => { + return Err(NamespaceError::InvalidInput { + message: format!( + "from_version must be a positive integer, got {} for create_table_branch", + v + ), + } + .into()); + } + Some(v) => Some(v as u64), + None => None, + }; + + let table_uri = self.resolve_table_location(&request.id).await?; + let mut dataset = self + .load_dataset(&table_uri, None, "create_table_branch") + .await?; + + // Best-effort pre-check: a duplicate returns a clean TableBranchAlreadyExists conflict + // instead of the opaque Internal error create_branch raises on a pre-existing branch. A + // concurrent create can still race past this window. Remove once lance-core create_branch + // returns RefConflict up front. 
+ if dataset.branches().get(&request.name).await.is_ok() { + return Err(NamespaceError::TableBranchAlreadyExists { + message: format!("branch '{}' for table at '{}'", request.name, table_uri), + } + .into()); + } + + dataset + .create_branch( + &request.name, + (request.from_branch.as_deref(), from_version), + None, + ) + .await + .map_err(|e| { + // After load_dataset + the dup pre-check, a DatasetNotFound from create_branch + // means the requested fork source (from_branch/from_version) doesn't exist. + if matches!(e, lance_core::Error::DatasetNotFound { .. }) { + NamespaceError::InvalidInput { + message: format!( + "from_branch/from_version for branch '{}' refers to a source that does not exist: {}", + request.name, e + ), + } + .into() + } else { + Self::map_branch_error(e, &request.name, &table_uri) + } + })?; + + Ok(CreateTableBranchResponse { + transaction_id: None, + }) + } + + async fn list_table_branches( + &self, + request: ListTableBranchesRequest, + ) -> Result { + self.record_op("list_table_branches"); + let table_uri = self.resolve_table_location(&request.id).await?; + let dataset = self + .load_dataset(&table_uri, None, "list_table_branches") + .await?; + + let raw_branches = dataset.list_branches().await.map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!( + "Failed to list branches for table at '{}': {}", + table_uri, e + ), + }) + })?; + + let branches = raw_branches + .into_iter() + .map(|(name, contents)| { + // The namespace `BranchContents` model has no `identifier` field, so the + // lance-core branch identifier is intentionally dropped here. 
+ let mut branch_model = ModelBranchContents::new( + contents.parent_version as i64, + contents.create_at as i64, + contents.manifest_size as i64, + ); + branch_model.parent_branch = contents.parent_branch; + branch_model.metadata = if contents.metadata.is_empty() { + None + } else { + Some(contents.metadata) + }; + (name, branch_model) + }) + .collect(); + + Ok(ListTableBranchesResponse { + branches, + page_token: None, + }) + } + + async fn delete_table_branch( + &self, + request: DeleteTableBranchRequest, + ) -> Result { + self.record_op("delete_table_branch"); + if request.name.is_empty() { + return Err(NamespaceError::InvalidInput { + message: "branch name must not be empty for delete_table_branch".to_string(), + } + .into()); + } + + let table_uri = self.resolve_table_location(&request.id).await?; + let mut dataset = self + .load_dataset(&table_uri, None, "delete_table_branch") + .await?; + + dataset + .delete_branch(&request.name) + .await + .map_err(|e| match e { + lance_core::Error::RefConflict { message } => NamespaceError::InvalidInput { + message: format!( + "branch '{}' for table at '{}': {}", + request.name, table_uri, message + ), + } + .into(), + other => Self::map_branch_error(other, &request.name, &table_uri), + })?; + + Ok(DeleteTableBranchResponse { + transaction_id: None, + }) + } + fn namespace_id(&self) -> String { format!("DirectoryNamespace {{ root: {:?} }}", self.root) } @@ -4287,6 +4619,7 @@ mod tests { use lance_core::utils::tempfile::{TempStdDir, TempStrDir}; use lance_core::utils::testing::CountingObjectStore; use lance_io::object_store::{providers::local::FileStoreProvider, uri_to_url}; + use lance_namespace::error::ErrorCode; use lance_namespace::models::{ CreateTableRequest, JsonArrowDataType, JsonArrowField, JsonArrowSchema, ListTablesRequest, QueryTableRequestColumns, @@ -4547,22 +4880,1610 @@ mod tests { Dataset::open(&table_uri).await.unwrap() } - async fn create_scalar_index( - namespace: &DirectoryNamespace, - table_name: 
&str, - index_name: &str, - ) -> Option { - use lance_namespace::models::CreateTableIndexRequest; + async fn create_scalar_index( + namespace: &DirectoryNamespace, + table_name: &str, + index_name: &str, + ) -> Option { + use lance_namespace::models::CreateTableIndexRequest; + + let mut create_index_request = + CreateTableIndexRequest::new("id".to_string(), "BTREE".to_string()); + create_index_request.id = Some(vec![table_name.to_string()]); + create_index_request.name = Some(index_name.to_string()); + namespace + .create_table_scalar_index(create_index_request) + .await + .unwrap() + .transaction_id + } + + /// Fork `branch_name` from the table's current version and append + /// `extra_versions` commits to it (each a new version on the branch, written + /// with the default V2 naming). The main branch is left untouched. Returns + /// the branch's storage URI (`/tree/`). + async fn create_branch_with_commits( + namespace: &DirectoryNamespace, + table_name: &str, + branch_name: &str, + extra_versions: usize, + ) -> String { + let mut main = open_dataset(namespace, table_name).await; + let fork_version = main.version().version; + let branch = main + .create_branch(branch_name, fork_version, None) + .await + .unwrap(); + let branch_uri = branch.uri().to_string(); + for i in 0..extra_versions { + append_scalar_version(&branch_uri, (i as i32 + 1) * 100).await; + } + branch_uri + } + + /// Append one scalar-schema batch to the dataset at `uri`, creating a new + /// version (default V2 naming). Shared by branch and main chain setup. 
+ async fn append_scalar_version(uri: &str, seed: i32) { + use arrow::array::{Int32Array, StringArray}; + use arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, true), + ])); + let batch = arrow::record_batch::RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![seed, seed + 1])), + Arc::new(StringArray::from(vec![Some("x"), Some("y")])), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + Dataset::write( + reader, + uri, + Some(WriteParams { + mode: WriteMode::Append, + ..Default::default() + }), + ) + .await + .unwrap(); + } + + /// List a table's versions on `branch` (None == main) via the namespace. + async fn list_versions( + namespace: &DirectoryNamespace, + table_name: &str, + branch: Option<&str>, + ) -> Result> { + let req = ListTableVersionsRequest { + id: Some(vec![table_name.to_string()]), + branch: branch.map(|b| b.to_string()), + ..Default::default() + }; + namespace.list_table_versions(req).await.map(|r| r.versions) + } + + #[tokio::test] + async fn test_list_table_versions_on_branch() { + let (namespace, _temp_dir) = create_test_namespace().await; + create_scalar_table(&namespace, "users").await; + create_branch_with_commits(&namespace, "users", "exp", 2).await; + + // The branch lists its own chain, and every version resolves to a + // manifest under the branch's tree path. + let branch_versions = list_versions(&namespace, "users", Some("exp")) + .await + .unwrap(); + assert!(branch_versions.len() >= 2); + assert!( + branch_versions + .iter() + .all(|v| v.manifest_path.contains("tree/exp")), + "branch versions must resolve to branch manifests: {:?}", + branch_versions + ); + + // Unset and "main" behave identically and never see the tree path. 
+ let main_versions = list_versions(&namespace, "users", None).await.unwrap(); + let main_explicit = list_versions(&namespace, "users", Some("main")) + .await + .unwrap(); + assert_eq!(main_versions.len(), main_explicit.len()); + assert!( + main_versions + .iter() + .all(|v| !v.manifest_path.contains("tree/")) + ); + + // A non-existent branch is a clean not-found, not an empty list. + let missing = list_versions(&namespace, "users", Some("does-not-exist")).await; + assert!(missing.is_err()); + assert!(missing.unwrap_err().to_string().contains("not found")); + } + + #[tokio::test] + async fn test_describe_table_version_on_branch() { + let (namespace, _temp_dir) = create_test_namespace().await; + create_scalar_table(&namespace, "users").await; + create_branch_with_commits(&namespace, "users", "exp", 2).await; + + let branch_versions = list_versions(&namespace, "users", Some("exp")) + .await + .unwrap(); + let latest = branch_versions.iter().map(|v| v.version).max().unwrap(); + + // Describe latest on the branch returns the branch's manifest_path. + let req = DescribeTableVersionRequest { + id: Some(vec!["users".to_string()]), + branch: Some("exp".to_string()), + ..Default::default() + }; + let resp = namespace.describe_table_version(req).await.unwrap(); + assert_eq!(resp.version.version, latest); + assert!(resp.version.manifest_path.contains("tree/exp")); + + // A specific existing branch version resolves. + let req = DescribeTableVersionRequest { + id: Some(vec!["users".to_string()]), + version: Some(latest), + branch: Some("exp".to_string()), + ..Default::default() + }; + assert!(namespace.describe_table_version(req).await.is_ok()); + + // A version absent on the branch is not found. + let req = DescribeTableVersionRequest { + id: Some(vec!["users".to_string()]), + version: Some(999_999), + branch: Some("exp".to_string()), + ..Default::default() + }; + assert!(namespace.describe_table_version(req).await.is_err()); + + // A non-existent branch is not found. 
+ let req = DescribeTableVersionRequest { + id: Some(vec!["users".to_string()]), + branch: Some("nope".to_string()), + ..Default::default() + }; + let err = namespace.describe_table_version(req).await; + assert!(err.is_err() && err.unwrap_err().to_string().contains("not found")); + } + + #[tokio::test] + async fn test_restore_table_on_branch() { + use lance_namespace::models::RestoreTableRequest; + + let (namespace, _temp_dir) = create_test_namespace().await; + create_scalar_table(&namespace, "users").await; + create_branch_with_commits(&namespace, "users", "exp", 2).await; + + let before = list_versions(&namespace, "users", Some("exp")) + .await + .unwrap(); + let branch_latest = before.iter().map(|v| v.version).max().unwrap(); + let earliest = before.iter().map(|v| v.version).min().unwrap(); + let main_before = list_versions(&namespace, "users", None) + .await + .unwrap() + .len(); + + // Restoring the branch to an earlier version commits a NEW version on + // the branch (restore is itself a commit), and must not touch main. 
+ let req = RestoreTableRequest { + id: Some(vec!["users".to_string()]), + version: earliest, + branch: Some("exp".to_string()), + ..Default::default() + }; + let resp = namespace.restore_table(req).await.unwrap(); + assert!(resp.transaction_id.is_some()); + + let after = list_versions(&namespace, "users", Some("exp")) + .await + .unwrap(); + let new_latest = after.iter().map(|v| v.version).max().unwrap(); + assert!( + new_latest > branch_latest, + "restore should add a branch version" + ); + + let main_after = list_versions(&namespace, "users", None) + .await + .unwrap() + .len(); + assert_eq!(main_after, main_before, "main must be unaffected"); + } + + #[tokio::test] + async fn test_batch_delete_table_versions_on_branch() { + use lance_namespace::models::{BatchDeleteTableVersionsRequest, VersionRange}; + + let (namespace, _temp_dir) = create_test_namespace().await; + create_scalar_table(&namespace, "users").await; + create_branch_with_commits(&namespace, "users", "exp", 2).await; + + let before = list_versions(&namespace, "users", Some("exp")) + .await + .unwrap(); + let main_before = list_versions(&namespace, "users", None).await.unwrap(); + + // Delete the branch's whole history with a through-latest range (end = -1). + // The branch manifests use V2 naming (inverted, zero-padded), so a nonzero + // deleted_count proves the V2 fix: the old code constructed + // "{version}.manifest" and silently matched nothing. + let req = BatchDeleteTableVersionsRequest { + id: Some(vec!["users".to_string()]), + branch: Some("exp".to_string()), + ranges: vec![VersionRange::new(0, -1)], + ..Default::default() + }; + let resp = namespace.batch_delete_table_versions(req).await.unwrap(); + assert_eq!( + resp.deleted_count, + Some(before.len() as i64), + "every branch manifest should be physically deleted" + ); + + // The emptied branch now reads as not-found, and main is untouched. 
+ assert!( + list_versions(&namespace, "users", Some("exp")) + .await + .is_err() + ); + let main_after = list_versions(&namespace, "users", None).await.unwrap(); + assert_eq!( + main_after.len(), + main_before.len(), + "main must be untouched" + ); + } + + #[tokio::test] + async fn test_create_table_version_on_branch() { + use futures::TryStreamExt; + use lance_namespace::models::CreateTableVersionRequest; + + let (namespace, _temp_dir) = create_test_namespace().await; + create_scalar_table(&namespace, "users").await; + let branch_uri = create_branch_with_commits(&namespace, "users", "exp", 1).await; + + // Stage a manifest by copying one of the branch's existing manifests. + let branch_ds = Dataset::open(&branch_uri).await.unwrap(); + let versions_dir = branch_ds.versions_dir(); + let store = branch_ds.object_store(None).await.unwrap(); + let existing = store + .inner + .list(Some(&versions_dir)) + .try_collect::>() + .await + .unwrap() + .into_iter() + .find(|m| { + m.location + .filename() + .map(|f| f.ends_with(".manifest")) + .unwrap_or(false) + }) + .expect("a branch manifest"); + let bytes = store + .inner + .get(&existing.location) + .await + .unwrap() + .bytes() + .await + .unwrap(); + let staging = versions_dir.join("staging_manifest"); + store.inner.put(&staging, bytes.into()).await.unwrap(); + + let main_before = list_versions(&namespace, "users", None) + .await + .unwrap() + .len(); + let new_version = list_versions(&namespace, "users", Some("exp")) + .await + .unwrap() + .iter() + .map(|v| v.version) + .max() + .unwrap() + + 1; + + let req = CreateTableVersionRequest { + id: Some(vec!["users".to_string()]), + version: new_version, + manifest_path: staging.to_string(), + naming_scheme: Some("V2".to_string()), + branch: Some("exp".to_string()), + ..Default::default() + }; + let resp = namespace.create_table_version(req).await.unwrap(); + let info = resp.version.expect("version info"); + // The new manifest must land under the branch's tree path. 
+ assert!( + info.manifest_path.contains("tree/exp"), + "got {}", + info.manifest_path + ); + + // It is visible on the branch, and main did not gain a version. + let after = list_versions(&namespace, "users", Some("exp")) + .await + .unwrap(); + assert!(after.iter().any(|v| v.version == new_version)); + let main_after = list_versions(&namespace, "users", None) + .await + .unwrap() + .len(); + assert_eq!(main_after, main_before, "main must be unaffected"); + } + + /// The namespace-managed commit store derives the branch a request targets + /// from the base path it is handed, so a single store serves every branch of + /// the table: a branch-qualified base resolves and commits against the + /// branch chain while the table root targets main. + #[tokio::test] + async fn test_external_manifest_store_resolves_branch_from_base_path() { + use futures::TryStreamExt; + use lance::io::commit::namespace_manifest::LanceNamespaceExternalManifestStore; + use lance_table::io::commit::external_manifest::ExternalManifestStore; + + let (namespace, _temp_dir) = create_test_namespace().await; + create_scalar_table(&namespace, "users").await; // main: version 1 + let branch_uri = create_branch_with_commits(&namespace, "users", "exp", 2).await; + + let namespace = Arc::new(namespace); + let table_id = vec!["users".to_string()]; + let branch_ds = Dataset::open(&branch_uri).await.unwrap(); + let branch_base = branch_ds.branch_location().path; + let root_base = branch_ds.branch_location().find_main().unwrap().path; + let store = LanceNamespaceExternalManifestStore::new( + namespace.clone(), + table_id.clone(), + root_base.clone(), + ); + + // The branch-qualified base resolves the branch chain, the root base + // resolves main: proof the base path reaches list_table_versions. 
+ let (branch_latest, branch_path) = store + .get_latest_version(branch_base.as_ref()) + .await + .unwrap() + .expect("branch has versions"); + let (_main_latest, main_path) = store + .get_latest_version(root_base.as_ref()) + .await + .unwrap() + .expect("main has versions"); + assert!( + branch_path.contains("tree/exp"), + "branch latest must resolve to the branch tree: {}", + branch_path + ); + assert!( + !main_path.contains("tree/exp"), + "main latest must not resolve to a branch tree: {}", + main_path + ); + + // describe (get) with the branch base also resolves to the branch tree. + let described = store + .get(branch_base.as_ref(), branch_latest) + .await + .unwrap(); + assert!( + described.contains("tree/exp"), + "describe on the branch must resolve to the branch tree: {}", + described + ); + + // A base that is neither the root nor a branch chain is rejected. + assert!(store.get_latest_version("somewhere/else").await.is_err()); + + // Commit (put) with the branch base: the new version must land on the + // branch chain. Stage a manifest by copying an existing branch manifest. 
+ let versions_dir = branch_ds.versions_dir(); + let obj = branch_ds.object_store(None).await.unwrap(); + let existing = obj + .inner + .list(Some(&versions_dir)) + .try_collect::>() + .await + .unwrap() + .into_iter() + .find(|m| { + m.location + .filename() + .map(|f| f.ends_with(".manifest")) + .unwrap_or(false) + }) + .expect("a branch manifest"); + let bytes = obj + .inner + .get(&existing.location) + .await + .unwrap() + .bytes() + .await + .unwrap(); + let size = bytes.len() as u64; + let staging = versions_dir.clone().join("staging_manifest"); + obj.inner.put(&staging, bytes.into()).await.unwrap(); + + let committed = store + .put( + &branch_base, + branch_latest + 1, + &staging, + size, + None, + obj.inner.as_ref(), + ManifestNamingScheme::V2, + ) + .await + .unwrap(); + assert!( + committed.path.to_string().contains("tree/exp"), + "a commit through a branch-qualified base must land on the branch tree: {}", + committed.path + ); + } + + /// write_into_namespace_on_branch must append against the branch chain + /// THROUGH the managed commit handler: the version is registered with the + /// namespace (create_table_version), lands on the branch tree, and main's + /// catalog is untouched. The ops-metrics assertions exist because a + /// physical-only commit is invisible to DirectoryNamespace branch listing + /// (it lists storage), while a catalog-authoritative namespace would + /// silently lose the version. 
+ #[tokio::test] + async fn test_write_into_namespace_on_branch_appends_to_branch() { + use lance::dataset::builder::DatasetBuilder; + use lance_namespace::models::CreateTableBranchRequest; + + let temp = TempStdDir::default(); + let namespace = Arc::new( + DirectoryNamespaceBuilder::new(temp.to_str().unwrap()) + .manifest_enabled(true) + .table_version_tracking_enabled(true) + .table_version_storage_enabled(true) + .ops_metrics_enabled(true) + .build() + .await + .unwrap(), + ); + let ns: Arc = namespace.clone(); + let table_id = vec!["t".to_string()]; + create_managed_table(&ns, &table_id).await; // main: v1 (id=1), v2 (id=2) + ns.create_table_branch(CreateTableBranchRequest { + id: Some(table_id.clone()), + name: "exp".to_string(), + ..Default::default() + }) + .await + .unwrap(); + + let main_chain_len = |ns: Arc, table_id: Vec| async move { + ns.list_table_versions(ListTableVersionsRequest { + id: Some(table_id), + ..Default::default() + }) + .await + .unwrap() + .versions + .len() + }; + let main_before = main_chain_len(ns.clone(), table_id.clone()).await; + let commits_before = namespace + .retrieve_ops_metrics() + .get("create_table_version") + .copied() + .unwrap_or(0); + + let branch_ds = Dataset::write_into_namespace_on_branch( + RecordBatchIterator::new(vec![Ok(single_int_batch(3))], single_int_schema()), + ns.clone(), + table_id.clone(), + "exp", + Some(WriteParams { + mode: WriteMode::Append, + ..Default::default() + }), + ) + .await + .unwrap(); + assert_eq!(branch_ds.manifest.branch.as_deref(), Some("exp")); + assert_eq!(scan_id_column(&branch_ds).await, vec![1, 2, 3]); + + // The append must commit through the namespace, not just write a + // physical manifest under the branch tree. 
+ let commits_after = namespace + .retrieve_ops_metrics() + .get("create_table_version") + .copied() + .unwrap_or(0); + assert_eq!( + commits_after, + commits_before + 1, + "the branch append must register its version via create_table_version" + ); + let exp_versions = ns + .list_table_versions(ListTableVersionsRequest { + id: Some(table_id.clone()), + branch: Some("exp".to_string()), + ..Default::default() + }) + .await + .unwrap() + .versions; + assert!( + exp_versions + .iter() + .all(|v| v.manifest_path.contains("tree/exp")), + "branch versions must resolve to the branch tree: {:?}", + exp_versions + ); + assert_eq!( + main_chain_len(ns.clone(), table_id.clone()).await, + main_before, + "main's catalog must be untouched by the branch append" + ); + + // A managed main append through the same entry point must register in + // the catalog too, so a fresh managed open resolves the new latest. + Dataset::write_into_namespace( + RecordBatchIterator::new(vec![Ok(single_int_batch(100))], single_int_schema()), + ns.clone(), + table_id.clone(), + Some(WriteParams { + mode: WriteMode::Append, + ..Default::default() + }), + ) + .await + .unwrap(); + assert_eq!( + main_chain_len(ns.clone(), table_id.clone()).await, + main_before + 1, + "a managed main append must register its version in the catalog" + ); + let fresh = DatasetBuilder::from_namespace(ns.clone(), table_id.clone()) + .await + .unwrap() + .load() + .await + .unwrap(); + assert_eq!( + scan_id_column(&fresh).await, + vec![1, 2, 100], + "a fresh managed open must resolve the appended version, not a stale latest" + ); + } + + /// CREATE on a branch is rejected: a branch forks from an existing version. 
+ #[tokio::test] + async fn test_write_into_namespace_on_branch_rejects_create() { + use arrow::array::{Int32Array, StringArray}; + use arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + + let (namespace, _temp_dir) = create_test_namespace().await; + let namespace = Arc::new(namespace); + + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, true), + ])); + let batch = arrow::record_batch::RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1])), + Arc::new(StringArray::from(vec![Some("a")])), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + + let result = Dataset::write_into_namespace_on_branch( + reader, + namespace.clone(), + vec!["new_table".to_string()], + "exp", + Some(WriteParams { + mode: WriteMode::Create, + ..Default::default() + }), + ) + .await; + assert!(result.is_err(), "create on a branch must be rejected"); + assert!( + result.unwrap_err().to_string().contains("branch"), + "error should mention the branch restriction" + ); + } + + #[tokio::test] + async fn test_branch_name_validation_rejects_traversal() { + let (namespace, _temp_dir) = create_test_namespace().await; + create_scalar_table(&namespace, "users").await; + + // A traversal-style branch name is rejected as invalid input before any + // storage path is built from it. 
+ let err = list_versions(&namespace, "users", Some("../evil")).await; + assert!(err.is_err()); + assert!(err.unwrap_err().to_string().contains("invalid branch name")); + } + + #[tokio::test] + async fn test_branch_ops_reject_zombie_branch() { + use futures::TryStreamExt; + use lance_namespace::models::{ + BatchDeleteTableVersionsRequest, CreateTableVersionRequest, RestoreTableRequest, + VersionRange, + }; + + let (namespace, _temp_dir) = create_test_namespace().await; + create_scalar_table(&namespace, "users").await; + + // Stage a real (loadable) manifest under tree/ghost/_versions/ without + // create_branch, so the path exists but has no BranchContents ref. + let dataset = open_dataset(&namespace, "users").await; + let store = dataset.object_store(None).await.unwrap(); + let manifest = store + .inner + .list(Some(&dataset.versions_dir())) + .try_collect::>() + .await + .unwrap() + .into_iter() + .find(|m| { + m.location + .filename() + .map(|f| f.ends_with(".manifest")) + .unwrap_or(false) + }) + .expect("a manifest"); + let bytes = store + .inner + .get(&manifest.location) + .await + .unwrap() + .bytes() + .await + .unwrap(); + let zombie = Path::from(format!( + "{}/tree/ghost/_versions/{}", + dataset.branch_location().path, + manifest.location.filename().unwrap() + )); + store.inner.put(&zombie, bytes.into()).await.unwrap(); + + // The directory is physically present, but the source of truth has no + // such branch -- this is what makes every op below reject it. 
+ assert!(dataset.branches().get("ghost").await.is_err()); + + fn rejected(label: &str, r: Result) { + match r { + Ok(v) => panic!("{label} must reject the zombie branch, got Ok({v:?})"), + Err(e) => assert!(e.to_string().contains("not found"), "{label}: {e}"), + } + } + + rejected( + "list", + list_versions(&namespace, "users", Some("ghost")).await, + ); + rejected( + "describe", + namespace + .describe_table_version(DescribeTableVersionRequest { + id: Some(vec!["users".to_string()]), + branch: Some("ghost".to_string()), + ..Default::default() + }) + .await, + ); + rejected( + "create", + namespace + .create_table_version(CreateTableVersionRequest { + id: Some(vec!["users".to_string()]), + version: 2, + manifest_path: zombie.to_string(), + branch: Some("ghost".to_string()), + ..Default::default() + }) + .await, + ); + rejected( + "restore", + namespace + .restore_table(RestoreTableRequest { + id: Some(vec!["users".to_string()]), + version: 1, + branch: Some("ghost".to_string()), + ..Default::default() + }) + .await, + ); + rejected( + "batch_delete", + namespace + .batch_delete_table_versions(BatchDeleteTableVersionsRequest { + id: Some(vec!["users".to_string()]), + branch: Some("ghost".to_string()), + ranges: vec![VersionRange::new(1, 1)], + ..Default::default() + }) + .await, + ); + } + + /// V2 is the default naming scheme, and the pre-rewrite delete path + /// constructed `{version}.manifest` (a V1 name) and silently matched nothing + /// on a V2 table, returning deleted_count 0. This pins the fix on the main + /// chain (branch=None), which previously had no batch_delete coverage at all. 
+ #[tokio::test] + async fn test_batch_delete_table_versions_main_v2() { + use lance_namespace::models::{BatchDeleteTableVersionsRequest, VersionRange}; + + let (namespace, _temp_dir) = create_test_namespace().await; + create_scalar_table(&namespace, "users").await; // version 1 + let main_uri = open_dataset(&namespace, "users").await.uri().to_string(); + append_scalar_version(&main_uri, 100).await; // version 2 + append_scalar_version(&main_uri, 200).await; // version 3 + + let before = list_versions(&namespace, "users", None).await.unwrap(); + assert!(before.len() >= 3); + // Confirm these really are V2-named manifests (20-digit inverted version + // + ".manifest" == 29 chars), i.e. the case the old code skipped. + assert!( + before + .iter() + .all(|v| v.manifest_path.rsplit('/').next().unwrap().len() == 29), + "expected V2-named manifests: {:?}", + before + ); + let min_v = before.iter().map(|v| v.version).min().unwrap(); + let max_v = before.iter().map(|v| v.version).max().unwrap(); + + // Delete everything except the latest version. end is exclusive, so + // [min_v, max_v) keeps max_v. + let req = BatchDeleteTableVersionsRequest { + id: Some(vec!["users".to_string()]), + ranges: vec![VersionRange::new(min_v, max_v)], + ..Default::default() + }; + let resp = namespace.batch_delete_table_versions(req).await.unwrap(); + assert_eq!( + resp.deleted_count, + Some((before.len() - 1) as i64), + "V2 manifests must actually be deleted (was 0 before the fix)" + ); + + let after = list_versions(&namespace, "users", None).await.unwrap(); + assert_eq!(after.len(), 1); + assert_eq!(after[0].version, max_v); + } + + /// Pins the exclusive end of VersionRange: [v, v+1) must match only v. 
+ #[tokio::test] + async fn test_batch_delete_end_is_exclusive() { + use lance_namespace::models::{BatchDeleteTableVersionsRequest, VersionRange}; + + let (namespace, _temp_dir) = create_test_namespace().await; + create_scalar_table(&namespace, "users").await; // version 1 + let main_uri = open_dataset(&namespace, "users").await.uri().to_string(); + append_scalar_version(&main_uri, 100).await; // version 2 + append_scalar_version(&main_uri, 200).await; // version 3 + + let before = list_versions(&namespace, "users", None).await.unwrap(); + let min_v = before.iter().map(|v| v.version).min().unwrap(); + + let req = BatchDeleteTableVersionsRequest { + id: Some(vec!["users".to_string()]), + ranges: vec![VersionRange::new(min_v, min_v + 1)], + ..Default::default() + }; + let resp = namespace.batch_delete_table_versions(req).await.unwrap(); + assert_eq!( + resp.deleted_count, + Some(1), + "only min_v is in [min_v, min_v+1)" + ); + + let after = list_versions(&namespace, "users", None).await.unwrap(); + assert!( + !after.iter().any(|v| v.version == min_v), + "min_v must be deleted" + ); + assert_eq!(after.len(), before.len() - 1, "exactly one version removed"); + } + + #[tokio::test] + async fn test_batch_delete_rejects_unbounded_range() { + use lance_namespace::models::{BatchDeleteTableVersionsRequest, VersionRange}; + + let (namespace, _temp_dir) = create_test_namespace().await; + create_scalar_table(&namespace, "users").await; + + // An unbounded range must be rejected up front, not turned into ~10^19 + // iterations / an unbounded id list. 
+ let req = BatchDeleteTableVersionsRequest { + id: Some(vec!["users".to_string()]), + ranges: vec![VersionRange::new(0, i64::MAX)], + ..Default::default() + }; + let err = namespace.batch_delete_table_versions(req).await; + assert!(err.is_err()); + assert!( + err.unwrap_err().to_string().contains("limit"), + "expected a range-too-large error" + ); + } + + /// The managed `__manifest` delete path (the authoritative catalog) must honor + /// the exclusive end: `[min, max)` removes exactly min..max from `__manifest`, + /// keeping max. With storage tracking on, the writes register versions in + /// `__manifest` and `list_table_versions` reads it back, so this exercises the + /// Phase-1 path that the physical-path tests never reach. + #[tokio::test] + async fn test_batch_delete_managed_manifest_exclusive() { + use arrow::array::Int32Array; + use arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + use lance_namespace::models::{BatchDeleteTableVersionsRequest, VersionRange}; + + let temp = TempStdDir::default(); + let ns: Arc = Arc::new( + DirectoryNamespaceBuilder::new(temp.to_str().unwrap()) + .manifest_enabled(true) + .table_version_tracking_enabled(true) + .table_version_storage_enabled(true) + .build() + .await + .unwrap(), + ); + let table_id = vec!["users".to_string()]; + let schema = Arc::new(ArrowSchema::new(vec![Field::new( + "id", + DataType::Int32, + false, + )])); + let batch = |seed: i32| { + arrow::record_batch::RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from(vec![seed]))], + ) + .unwrap() + }; + + // Register v1, v2, v3 in __manifest via the managed write flow. 
+ let mut ds = Dataset::write_into_namespace( + RecordBatchIterator::new(vec![Ok(batch(1))], schema.clone()), + ns.clone(), + table_id.clone(), + Some(WriteParams { + mode: WriteMode::Create, + ..Default::default() + }), + ) + .await + .unwrap(); + ds.append( + RecordBatchIterator::new(vec![Ok(batch(2))], schema.clone()), + None, + ) + .await + .unwrap(); + ds.append( + RecordBatchIterator::new(vec![Ok(batch(3))], schema.clone()), + None, + ) + .await + .unwrap(); + + let before = ns + .list_table_versions(ListTableVersionsRequest { + id: Some(table_id.clone()), + ..Default::default() + }) + .await + .unwrap() + .versions; + assert!( + before.len() >= 3, + "expected v1..v3 tracked in __manifest: {:?}", + before + ); + let min_v = before.iter().map(|v| v.version).min().unwrap(); + let max_v = before.iter().map(|v| v.version).max().unwrap(); + + // [min, max): exclusive end keeps max. + ns.batch_delete_table_versions(BatchDeleteTableVersionsRequest { + id: Some(table_id.clone()), + ranges: vec![VersionRange::new(min_v, max_v)], + ..Default::default() + }) + .await + .unwrap(); + + let after = ns + .list_table_versions(ListTableVersionsRequest { + id: Some(table_id.clone()), + ..Default::default() + }) + .await + .unwrap() + .versions; + assert_eq!( + after.len(), + 1, + "only the exclusive end (max) should remain in __manifest: {:?}", + after + ); + assert_eq!(after[0].version, max_v, "max must be kept"); + } + + /// On the managed path, a through-latest delete (`end_version < 0`) is rejected + /// rather than silently deleting physical files while leaving `__manifest` + /// records in place. 
+ #[tokio::test] + async fn test_batch_delete_managed_rejects_through_latest() { + use lance_namespace::models::{BatchDeleteTableVersionsRequest, VersionRange}; + + let temp = TempStdDir::default(); + let ns: Arc = Arc::new( + DirectoryNamespaceBuilder::new(temp.to_str().unwrap()) + .manifest_enabled(true) + .table_version_tracking_enabled(true) + .table_version_storage_enabled(true) + .build() + .await + .unwrap(), + ); + + let err = ns + .batch_delete_table_versions(BatchDeleteTableVersionsRequest { + id: Some(vec!["users".to_string()]), + ranges: vec![VersionRange::new(0, -1)], + ..Default::default() + }) + .await; + assert!( + err.is_err(), + "through-latest delete must be rejected on the managed path" + ); + assert!( + err.unwrap_err().to_string().contains("not supported"), + "expected a not-supported error" + ); + } + + /// Build a managed (manifest-tracked) namespace over `path`. + async fn create_managed_namespace(path: &str) -> Arc { + Arc::new( + DirectoryNamespaceBuilder::new(path) + .manifest_enabled(true) + .table_version_tracking_enabled(true) + .table_version_storage_enabled(true) + .build() + .await + .unwrap(), + ) + } + + fn single_int_schema() -> Arc { + use arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + Arc::new(ArrowSchema::new(vec![Field::new( + "id", + DataType::Int32, + false, + )])) + } + + fn single_int_batch(seed: i32) -> arrow::record_batch::RecordBatch { + use arrow::array::Int32Array; + arrow::record_batch::RecordBatch::try_new( + single_int_schema(), + vec![Arc::new(Int32Array::from(vec![seed]))], + ) + .unwrap() + } + + /// Create a managed table with versions v1 (id=1) and v2 (id=2) on main and + /// return the main dataset handle. 
+ async fn create_managed_table(ns: &Arc, table_id: &[String]) -> Dataset { + let mut ds = Dataset::write_into_namespace( + RecordBatchIterator::new(vec![Ok(single_int_batch(1))], single_int_schema()), + ns.clone(), + table_id.to_vec(), + Some(WriteParams { + mode: WriteMode::Create, + ..Default::default() + }), + ) + .await + .unwrap(); + ds.append( + RecordBatchIterator::new(vec![Ok(single_int_batch(2))], single_int_schema()), + None, + ) + .await + .unwrap(); + ds + } + + /// Sorted values of the `id` column across a full scan. + async fn scan_id_column(ds: &Dataset) -> Vec { + use arrow::array::Int32Array; + use futures::TryStreamExt; + let batches: Vec = ds + .scan() + .try_into_stream() + .await + .unwrap() + .try_collect() + .await + .unwrap(); + let mut ids: Vec = batches + .iter() + .flat_map(|b| { + b.column(0) + .as_any() + .downcast_ref::() + .unwrap() + .values() + .to_vec() + }) + .collect(); + ids.sort(); + ids + } + + /// E2e for the managed branch path through the builder: create a branch via the + /// namespace op, open it with `from_namespace(managed).with_branch`, commit on + /// it, and confirm the dataset is rooted at the branch chain (manifest, base + /// path and data placement) while main's catalog is untouched. + #[tokio::test] + async fn test_managed_branch_open_and_commit() { + use futures::TryStreamExt; + use lance::dataset::builder::DatasetBuilder; + use lance_namespace::models::CreateTableBranchRequest; + + let temp = TempStdDir::default(); + let ns = create_managed_namespace(temp.to_str().unwrap()).await; + let table_id = vec!["t".to_string()]; + create_managed_table(&ns, &table_id).await; + let main_before = ns + .list_table_versions(ListTableVersionsRequest { + id: Some(table_id.clone()), + ..Default::default() + }) + .await + .unwrap() + .versions + .len(); + + // Create a branch via the namespace op (the FS-handler path, which succeeds + // on a managed table). 
+ ns.create_table_branch(CreateTableBranchRequest { + id: Some(table_id.clone()), + name: "exp".to_string(), + ..Default::default() + }) + .await + .unwrap(); + + // Open the managed table on the branch: the base path is qualified up + // front and the manifest store derives the branch from it. + let mut branch_ds = DatasetBuilder::from_namespace(ns.clone(), table_id.clone()) + .await + .unwrap() + .with_branch("exp", None) + .load() + .await + .unwrap(); + assert_eq!( + branch_ds.manifest.branch.as_deref(), + Some("exp"), + "with_branch on a managed table must open the branch chain" + ); + let branch_base = branch_ds.branch_location().path; + assert!( + branch_base.as_ref().ends_with("tree/exp"), + "the branch dataset must be rooted at the branch chain: {}", + branch_base + ); + let branch_v_before = branch_ds.version().version; + + // Commit on the branch. + branch_ds + .append( + RecordBatchIterator::new(vec![Ok(single_int_batch(3))], single_int_schema()), + None, + ) + .await + .unwrap(); + assert_eq!( + branch_ds.manifest.branch.as_deref(), + Some("exp"), + "the commit must stay on the branch" + ); + assert!( + branch_ds.version().version > branch_v_before, + "the branch version must advance after the commit" + ); + assert_eq!(scan_id_column(&branch_ds).await, vec![1, 2, 3]); + + // The committed data files live under the branch chain, not main's data + // dir, so unmanaged readers of the branch and main's cleanup see a + // consistent layout. + let store = branch_ds.object_store(None).await.unwrap(); + let branch_data = branch_base.clone().join("data"); + let branch_files = store + .inner + .list(Some(&branch_data)) + .try_collect::>() + .await + .unwrap(); + assert!( + !branch_files.is_empty(), + "the branch commit must place data files under the branch chain" + ); + + // The same branch is readable through the unmanaged (path-based) open. 
+ let table_uri = ns + .describe_table(DescribeTableRequest { + id: Some(table_id.clone()), + ..Default::default() + }) + .await + .unwrap() + .location + .unwrap(); + let fs_branch_ds = DatasetBuilder::from_uri(&table_uri) + .with_branch("exp", None) + .load() + .await + .unwrap(); + assert_eq!(fs_branch_ds.manifest.branch.as_deref(), Some("exp")); + assert_eq!(scan_id_column(&fs_branch_ds).await, vec![1, 2, 3]); + + // Main's catalog is untouched (branches are not tracked in __manifest), + // and main still reads its own data. + let main_after = ns + .list_table_versions(ListTableVersionsRequest { + id: Some(table_id.clone()), + ..Default::default() + }) + .await + .unwrap() + .versions + .len(); + assert_eq!( + main_after, main_before, + "committing on the branch must not change main's chain" + ); + let main_ds = DatasetBuilder::from_namespace(ns.clone(), table_id.clone()) + .await + .unwrap() + .load() + .await + .unwrap(); + assert_eq!(main_ds.manifest.branch, None); + assert_eq!(scan_id_column(&main_ds).await, vec![1, 2]); + } + + /// Branch-pointing tags on a managed table: create them through the normal + /// API (from both the main and the branch handle), open the table at the + /// tag, and check the tag out from an already-open dataset. All of these + /// must resolve the branch chain, never main's chain. 
+ #[tokio::test] + async fn test_managed_branch_tags() { + use lance::dataset::builder::DatasetBuilder; + use lance::dataset::refs::Ref; + use lance_namespace::models::CreateTableBranchRequest; + + let temp = TempStdDir::default(); + let ns = create_managed_namespace(temp.to_str().unwrap()).await; + let table_id = vec!["t".to_string()]; + let main_ds = create_managed_table(&ns, &table_id).await; + ns.create_table_branch(CreateTableBranchRequest { + id: Some(table_id.clone()), + name: "exp".to_string(), + ..Default::default() + }) + .await + .unwrap(); + let mut branch_ds = DatasetBuilder::from_namespace(ns.clone(), table_id.clone()) + .await + .unwrap() + .with_branch("exp", None) + .load() + .await + .unwrap(); + branch_ds + .append( + RecordBatchIterator::new(vec![Ok(single_int_batch(3))], single_int_schema()), + None, + ) + .await + .unwrap(); + let branch_version = branch_ds.version().version; + + // A branch-pointing tag created from the main handle must validate + // against the branch chain (the version does not exist on main). + main_ds + .tags() + .create("exp-tag", ("exp", Some(branch_version))) + .await + .unwrap(); + let tag = main_ds.tags().get("exp-tag").await.unwrap(); + assert_eq!(tag.branch.as_deref(), Some("exp")); + assert_eq!(tag.version, branch_version); + + // A tag created from the branch handle resolves the branch implicitly. + branch_ds + .tags() + .create("exp-tag2", branch_version) + .await + .unwrap(); + let tag2 = branch_ds.tags().get("exp-tag2").await.unwrap(); + assert_eq!(tag2.branch.as_deref(), Some("exp")); + + // Opening the managed table at the branch-pointing tag checks out the + // branch chain. 
+ let tag_open = DatasetBuilder::from_namespace(ns.clone(), table_id.clone()) + .await + .unwrap() + .with_tag("exp-tag") + .load() + .await + .unwrap(); + assert_eq!(tag_open.manifest.branch.as_deref(), Some("exp")); + assert_eq!(tag_open.version().version, branch_version); + assert_eq!(scan_id_column(&tag_open).await, vec![1, 2, 3]); + + // So does checking the tag out from an already-open main dataset. + let tag_checkout = main_ds + .checkout_version(Ref::Tag("exp-tag".to_string())) + .await + .unwrap(); + assert_eq!(tag_checkout.manifest.branch.as_deref(), Some("exp")); + assert_eq!(scan_id_column(&tag_checkout).await, vec![1, 2, 3]); + + // A missing tag on a managed table errors at open. + let err = DatasetBuilder::from_namespace(ns.clone(), table_id.clone()) + .await + .unwrap() + .with_tag("no-such-tag") + .load() + .await; + assert!(err.is_err(), "a missing tag must error"); + } + + /// Cross-branch checkout on a managed table, including version numbers that + /// exist on both chains (branch numbering continues from the fork point, so + /// overlap is the common case). Every checkout must land on the requested + /// chain and read that chain's data. + #[tokio::test] + async fn test_managed_cross_branch_checkout() { + use lance::dataset::builder::DatasetBuilder; + use lance::dataset::refs::Ref; + use lance_namespace::models::CreateTableBranchRequest; + + let temp = TempStdDir::default(); + let ns = create_managed_namespace(temp.to_str().unwrap()).await; + let table_id = vec!["t".to_string()]; + let mut main_ds = create_managed_table(&ns, &table_id).await; + ns.create_table_branch(CreateTableBranchRequest { + id: Some(table_id.clone()), + name: "exp".to_string(), + ..Default::default() + }) + .await + .unwrap(); + + // exp gets id=3 at its tip; main gets id=100 at the same version number. 
+ let mut branch_ds = DatasetBuilder::from_namespace(ns.clone(), table_id.clone()) + .await + .unwrap() + .with_branch("exp", None) + .load() + .await + .unwrap(); + branch_ds + .append( + RecordBatchIterator::new(vec![Ok(single_int_batch(3))], single_int_schema()), + None, + ) + .await + .unwrap(); + let overlap_version = branch_ds.version().version; + while main_ds.version().version < overlap_version { + main_ds + .append( + RecordBatchIterator::new(vec![Ok(single_int_batch(100))], single_int_schema()), + None, + ) + .await + .unwrap(); + } + + // main -> branch at the overlapping version number: must read the + // branch's data, not main's same-numbered version. + let on_branch = main_ds + .checkout_version(Ref::Version(Some("exp".to_string()), Some(overlap_version))) + .await + .unwrap(); + assert_eq!(on_branch.manifest.branch.as_deref(), Some("exp")); + assert_eq!(scan_id_column(&on_branch).await, vec![1, 2, 3]); + + // main -> branch latest. + let mut on_branch_latest = main_ds.checkout_branch("exp").await.unwrap(); + assert_eq!(on_branch_latest.manifest.branch.as_deref(), Some("exp")); + assert_eq!(on_branch_latest.version().version, overlap_version); + + // A commit through the checked-out handle (which shares main's commit + // handler) must land on the branch chain, not main's. 
+ let main_chain_len = |ns: Arc, table_id: Vec| async move { + ns.list_table_versions(ListTableVersionsRequest { + id: Some(table_id), + ..Default::default() + }) + .await + .unwrap() + .versions + .len() + }; + let main_before = main_chain_len(ns.clone(), table_id.clone()).await; + on_branch_latest + .append( + RecordBatchIterator::new(vec![Ok(single_int_batch(4))], single_int_schema()), + None, + ) + .await + .unwrap(); + assert_eq!(on_branch_latest.manifest.branch.as_deref(), Some("exp")); + assert_eq!(scan_id_column(&on_branch_latest).await, vec![1, 2, 3, 4]); + assert_eq!( + main_chain_len(ns.clone(), table_id.clone()).await, + main_before, + "a commit on the checked-out branch must not advance main's chain" + ); + + // branch -> main at a specific version. + let on_main = branch_ds + .checkout_version(Ref::Version(None, Some(1))) + .await + .unwrap(); + assert_eq!(on_main.manifest.branch, None); + assert_eq!(scan_id_column(&on_main).await, vec![1]); + + // branch -> another branch. + ns.create_table_branch(CreateTableBranchRequest { + id: Some(table_id.clone()), + name: "exp2".to_string(), + ..Default::default() + }) + .await + .unwrap(); + let on_branch2 = branch_ds.checkout_branch("exp2").await.unwrap(); + assert_eq!(on_branch2.manifest.branch.as_deref(), Some("exp2")); + + // A version missing from the branch chain errors loudly. + let err = main_ds + .checkout_version(Ref::Version(Some("exp".to_string()), Some(999))) + .await; + assert!(err.is_err(), "a version missing from the branch must error"); + } + + /// CommitBuilder must honor an explicitly supplied commit handler for a + /// Dataset destination: a managed-versioning commit through a dataset that + /// was opened without the namespace handler (as the Java and Python commit + /// APIs allow) must still register with the catalog instead of silently + /// writing a physical manifest the catalog never sees. 
+ #[tokio::test] + async fn test_commit_builder_honors_explicit_handler_for_dataset_dest() { + use lance::dataset::write::{CommitBuilder, InsertBuilder}; + use lance::dataset::{WriteDestination, builder::DatasetBuilder}; + use lance::io::commit::namespace_manifest::LanceNamespaceExternalManifestStore; + use lance_table::io::commit::external_manifest::ExternalManifestCommitHandler; + + let temp = TempStdDir::default(); + let namespace = Arc::new( + DirectoryNamespaceBuilder::new(temp.to_str().unwrap()) + .manifest_enabled(true) + .table_version_tracking_enabled(true) + .table_version_storage_enabled(true) + .ops_metrics_enabled(true) + .build() + .await + .unwrap(), + ); + let ns: Arc = namespace.clone(); + let table_id = vec!["t".to_string()]; + create_managed_table(&ns, &table_id).await; // main: v1 (id=1), v2 (id=2) + + // Open WITHOUT the namespace handler, the way a binding caller can. + let table_uri = ns + .describe_table(DescribeTableRequest { + id: Some(table_id.clone()), + ..Default::default() + }) + .await + .unwrap() + .location + .unwrap(); + let plain_ds = Arc::new(Dataset::open(&table_uri).await.unwrap()); + + let transaction = InsertBuilder::new(WriteDestination::Dataset(plain_ds.clone())) + .with_params(&WriteParams { + mode: WriteMode::Append, + ..Default::default() + }) + .execute_uncommitted(vec![single_int_batch(3)]) + .await + .unwrap(); + + let handler = Arc::new(ExternalManifestCommitHandler { + external_manifest_store: Arc::new( + LanceNamespaceExternalManifestStore::for_table_uri( + ns.clone(), + table_id.clone(), + &table_uri, + ) + .unwrap(), + ), + }); + let commits_before = namespace + .retrieve_ops_metrics() + .get("create_table_version") + .copied() + .unwrap_or(0); + let committed = CommitBuilder::new(WriteDestination::Dataset(plain_ds)) + .with_commit_handler(handler) + .execute(transaction) + .await + .unwrap(); + assert_eq!(scan_id_column(&committed).await, vec![1, 2, 3]); + + let commits_after = namespace + 
.retrieve_ops_metrics() + .get("create_table_version") + .copied() + .unwrap_or(0); + assert_eq!( + commits_after, + commits_before + 1, + "the explicit handler must route the commit through create_table_version" + ); + let fresh = DatasetBuilder::from_namespace(ns.clone(), table_id.clone()) + .await + .unwrap() + .load() + .await + .unwrap(); + assert_eq!( + scan_id_column(&fresh).await, + vec![1, 2, 3], + "a fresh managed open must resolve the committed version" + ); + } + + /// A branch forked from a non-latest version opens on its own chain. + #[tokio::test] + async fn test_managed_branch_from_non_latest_fork() { + use lance::dataset::builder::DatasetBuilder; + use lance_namespace::models::CreateTableBranchRequest; + + let temp = TempStdDir::default(); + let ns = create_managed_namespace(temp.to_str().unwrap()).await; + let table_id = vec!["t".to_string()]; + create_managed_table(&ns, &table_id).await; // main: v1 (id=1), v2 (id=2) + + ns.create_table_branch(CreateTableBranchRequest { + id: Some(table_id.clone()), + name: "old".to_string(), + from_version: Some(1), + ..Default::default() + }) + .await + .unwrap(); + + let old_ds = DatasetBuilder::from_namespace(ns.clone(), table_id.clone()) + .await + .unwrap() + .with_branch("old", None) + .load() + .await + .unwrap(); + assert_eq!(old_ds.manifest.branch.as_deref(), Some("old")); + assert_eq!( + scan_id_column(&old_ds).await, + vec![1], + "the fork must contain only the fork-point data" + ); + } + + /// The shared parser must decode both naming schemes; this is the cheap + /// V1 no-regression guard (creating a real V1 table is not exposed here). + #[test] + fn test_manifest_version_from_filename() { + // V1: the plain version number. + assert_eq!( + DirectoryNamespace::manifest_version_from_filename("5.manifest"), + Some(5) + ); + assert_eq!( + DirectoryNamespace::manifest_version_from_filename("0.manifest"), + Some(0) + ); + // V2: version stored as u64::MAX - version, zero-padded to 20 digits. 
+ let v2_five = format!("{:020}.manifest", u64::MAX - 5); + assert_eq!( + DirectoryNamespace::manifest_version_from_filename(&v2_five), + Some(5) + ); + let v2_zero = format!("{:020}.manifest", u64::MAX); + assert_eq!( + DirectoryNamespace::manifest_version_from_filename(&v2_zero), + Some(0) + ); + // Non-manifest and detached (`d`-prefixed) entries are ignored. + assert_eq!( + DirectoryNamespace::manifest_version_from_filename("data.lance"), + None + ); + assert_eq!( + DirectoryNamespace::manifest_version_from_filename("d5.manifest"), + None + ); + } + + /// With the manifest store enabled, branch ops must still bypass the catalog + /// fast-path and read the chain from `tree//_versions/`. Without the + /// `branch.is_none()` guard this would query `__manifest` (which has no + /// branch entries) and return the wrong result. The other branch tests use a + /// store-disabled namespace, so this pins the enabled path specifically. + #[tokio::test] + async fn test_branch_ops_skip_manifest_store_when_enabled() { + let temp_dir = TempStdDir::default(); + let namespace = DirectoryNamespaceBuilder::new(temp_dir.to_str().unwrap()) + .manifest_enabled(true) + .table_version_storage_enabled(true) + .build() + .await + .unwrap(); - let mut create_index_request = - CreateTableIndexRequest::new("id".to_string(), "BTREE".to_string()); - create_index_request.id = Some(vec![table_name.to_string()]); - create_index_request.name = Some(index_name.to_string()); - namespace - .create_table_scalar_index(create_index_request) + create_scalar_table(&namespace, "users").await; + create_branch_with_commits(&namespace, "users", "exp", 2).await; + + // list resolves the branch chain from storage despite storage tracking + // being on (a successful result with tree/exp paths proves the bypass: + // the catalog has no "exp" entry, so the fast-path would not return these). 
+ let branch_versions = list_versions(&namespace, "users", Some("exp")) .await - .unwrap() - .transaction_id + .unwrap(); + assert!(branch_versions.len() >= 2); + assert!( + branch_versions + .iter() + .all(|v| v.manifest_path.contains("tree/exp")), + "branch versions must come from branch storage with the store enabled: {:?}", + branch_versions + ); + + // describe likewise resolves from the branch's storage. + let req = DescribeTableVersionRequest { + id: Some(vec!["users".to_string()]), + branch: Some("exp".to_string()), + ..Default::default() + }; + let resp = namespace.describe_table_version(req).await.unwrap(); + assert!(resp.version.manifest_path.contains("tree/exp")); } #[tokio::test] @@ -9908,6 +11829,400 @@ mod tests { (namespace, temp_dir, table_id) } + /// Downcast a lance-core error to its NamespaceError code for precise assertions. + fn namespace_code(err: &Error) -> Option { + match err { + Error::Namespace { source, .. } => { + source.downcast_ref::().map(|e| e.code()) + } + _ => None, + } + } + + #[tokio::test] + async fn test_create_and_list_branches() { + let (namespace, _temp_dir, table_id) = create_tagged_test_table(3).await; + + namespace + .create_table_branch(CreateTableBranchRequest { + id: Some(table_id.clone()), + name: "dev".to_string(), + ..Default::default() + }) + .await + .unwrap(); + namespace + .create_table_branch(CreateTableBranchRequest { + id: Some(table_id.clone()), + name: "staging".to_string(), + ..Default::default() + }) + .await + .unwrap(); + + let resp = namespace + .list_table_branches(ListTableBranchesRequest { + id: Some(table_id.clone()), + ..Default::default() + }) + .await + .unwrap(); + assert_eq!( + resp.branches.len(), + 2, + "expected 2 branches, got: {:?}", + resp.branches + ); + assert!(resp.branches.contains_key("dev")); + assert!(resp.branches.contains_key("staging")); + assert!(resp.page_token.is_none()); + + // Deleting one branch is reflected in a subsequent list. 
+ namespace + .delete_table_branch(DeleteTableBranchRequest { + id: Some(table_id.clone()), + name: "dev".to_string(), + ..Default::default() + }) + .await + .unwrap(); + + let resp = namespace + .list_table_branches(ListTableBranchesRequest { + id: Some(table_id), + ..Default::default() + }) + .await + .unwrap(); + assert_eq!(resp.branches.len(), 1, "expected 1 branch after delete"); + assert!(!resp.branches.contains_key("dev")); + assert!(resp.branches.contains_key("staging")); + } + + #[tokio::test] + async fn test_create_branch_from_version() { + let (namespace, _temp_dir, table_id) = create_tagged_test_table(3).await; + + // Fork explicitly from version 1 of main. + namespace + .create_table_branch(CreateTableBranchRequest { + id: Some(table_id.clone()), + name: "from-v1".to_string(), + from_version: Some(1), + ..Default::default() + }) + .await + .unwrap(); + + let resp = namespace + .list_table_branches(ListTableBranchesRequest { + id: Some(table_id), + ..Default::default() + }) + .await + .unwrap(); + let branch = resp + .branches + .get("from-v1") + .expect("forked branch should be listed"); + assert_eq!( + branch.parent_version, 1, + "branch should fork from version 1" + ); + assert!( + branch.parent_branch.is_none(), + "a branch forked from main has no parent branch" + ); + } + + /// Forking from a NON-main source branch must clone that branch's chain. + /// Both chains are given a version 2 with diverged content, so a clone that + /// wrongly resolves the version under main succeeds silently with main's + /// data instead of erroring. 
+ #[tokio::test] + async fn test_create_branch_from_other_branch() { + use lance::dataset::builder::DatasetBuilder; + + let (namespace, _temp_dir) = create_test_namespace().await; + create_scalar_table(&namespace, "users").await; // main v1: ids [1, 2, 3] + // dev: forked at v1, one append (ids 100, 101) -> dev v2 + create_branch_with_commits(&namespace, "users", "dev", 1).await; + // Diverge main to the same version number with different content. + let main_ds = open_dataset(&namespace, "users").await; + append_scalar_version(main_ds.uri(), 500).await; // main v2: + ids [500, 501] + + namespace + .create_table_branch(CreateTableBranchRequest { + id: Some(vec!["users".to_string()]), + name: "child".to_string(), + from_branch: Some("dev".to_string()), + from_version: Some(2), + ..Default::default() + }) + .await + .unwrap(); + + let child_ds = DatasetBuilder::from_uri(main_ds.uri()) + .with_branch("child", None) + .load() + .await + .unwrap(); + let ids = scan_id_column(&child_ds).await; + assert!( + ids.contains(&100) && ids.contains(&101), + "child must contain dev's appended rows, got: {:?}", + ids + ); + assert!( + !ids.contains(&500), + "child must not contain main's diverged rows, got: {:?}", + ids + ); + + // The recorded metadata and the cloned data must agree on the parent. 
+ let listed = namespace + .list_table_branches(ListTableBranchesRequest { + id: Some(vec!["users".to_string()]), + ..Default::default() + }) + .await + .unwrap(); + assert_eq!( + listed + .branches + .get("child") + .unwrap() + .parent_branch + .as_deref(), + Some("dev") + ); + } + + #[tokio::test] + async fn test_create_existing_branch_conflict() { + let (namespace, _temp_dir, table_id) = create_tagged_test_table(2).await; + + namespace + .create_table_branch(CreateTableBranchRequest { + id: Some(table_id.clone()), + name: "dev".to_string(), + ..Default::default() + }) + .await + .unwrap(); + + let err = namespace + .create_table_branch(CreateTableBranchRequest { + id: Some(table_id), + name: "dev".to_string(), + ..Default::default() + }) + .await + .unwrap_err(); + assert_eq!( + namespace_code(&err), + Some(ErrorCode::TableBranchAlreadyExists), + "expected TableBranchAlreadyExists, got: {}", + err + ); + assert!( + err.to_string().to_lowercase().contains("already exists"), + "expected already-exists message, got: {}", + err + ); + } + + #[tokio::test] + async fn test_delete_unknown_branch() { + let (namespace, _temp_dir, table_id) = create_tagged_test_table(2).await; + + let err = namespace + .delete_table_branch(DeleteTableBranchRequest { + id: Some(table_id), + name: "does-not-exist".to_string(), + ..Default::default() + }) + .await + .unwrap_err(); + assert_eq!( + namespace_code(&err), + Some(ErrorCode::TableBranchNotFound), + "expected TableBranchNotFound, got: {}", + err + ); + assert!( + err.to_string().to_lowercase().contains("not found"), + "expected not-found message, got: {}", + err + ); + } + + #[tokio::test] + async fn test_delete_referenced_branch_conflict() { + let (namespace, _temp_dir, table_id) = create_tagged_test_table(2).await; + + // A child forked from `parent` (via from_branch) makes `parent` a referenced branch. 
+ namespace + .create_table_branch(CreateTableBranchRequest { + id: Some(table_id.clone()), + name: "parent".to_string(), + ..Default::default() + }) + .await + .unwrap(); + namespace + .create_table_branch(CreateTableBranchRequest { + id: Some(table_id.clone()), + name: "child".to_string(), + from_branch: Some("parent".to_string()), + ..Default::default() + }) + .await + .unwrap(); + + // from_branch resolution: the child records its parent branch as its fork point. + let listed = namespace + .list_table_branches(ListTableBranchesRequest { + id: Some(table_id.clone()), + ..Default::default() + }) + .await + .unwrap(); + let child = listed + .branches + .get("child") + .expect("child branch should be listed"); + assert_eq!( + child.parent_branch.as_deref(), + Some("parent"), + "child should record parent branch as its fork point" + ); + assert!( + child.parent_version >= 1, + "child should record the parent version it forked from, got {}", + child.parent_version + ); + + // Deleting a branch that still has dependents is refused. The delete spec has no 409, + // so it surfaces as a documented InvalidInput (400), not a conflict status. 
+ let err = namespace + .delete_table_branch(DeleteTableBranchRequest { + id: Some(table_id), + name: "parent".to_string(), + ..Default::default() + }) + .await + .unwrap_err(); + assert_eq!( + namespace_code(&err), + Some(ErrorCode::InvalidInput), + "expected InvalidInput for deleting a referenced branch, got: {}", + err + ); + assert!( + err.to_string().to_lowercase().contains("referenced"), + "error should explain the branch is still referenced, got: {}", + err + ); + } + + #[tokio::test] + async fn test_branch_name_required() { + let (namespace, _temp_dir, table_id) = create_tagged_test_table(2).await; + + let create_err = namespace + .create_table_branch(CreateTableBranchRequest { + id: Some(table_id.clone()), + name: String::new(), + ..Default::default() + }) + .await + .unwrap_err(); + assert_eq!( + namespace_code(&create_err), + Some(ErrorCode::InvalidInput), + "empty name on create should be InvalidInput, got: {}", + create_err + ); + assert!( + create_err + .to_string() + .to_lowercase() + .contains("must not be empty") + ); + + let delete_err = namespace + .delete_table_branch(DeleteTableBranchRequest { + id: Some(table_id), + name: String::new(), + ..Default::default() + }) + .await + .unwrap_err(); + assert_eq!( + namespace_code(&delete_err), + Some(ErrorCode::InvalidInput), + "empty name on delete should be InvalidInput, got: {}", + delete_err + ); + assert!( + delete_err + .to_string() + .to_lowercase() + .contains("must not be empty") + ); + } + + #[tokio::test] + async fn test_create_branch_rejects_negative_from_version() { + let (namespace, _temp_dir, table_id) = create_tagged_test_table(2).await; + + let err = namespace + .create_table_branch(CreateTableBranchRequest { + id: Some(table_id), + name: "dev".to_string(), + from_version: Some(-1), + ..Default::default() + }) + .await + .unwrap_err(); + assert_eq!( + namespace_code(&err), + Some(ErrorCode::InvalidInput), + "negative from_version should be InvalidInput, got: {}", + err + ); + 
assert!(err.to_string().to_lowercase().contains("from_version")); + } + + #[tokio::test] + async fn test_create_branch_nonexistent_from_version() { + let (namespace, _temp_dir, table_id) = create_tagged_test_table(2).await; + + // Version 999 does not exist (the table has 2 versions). create_branch's clone phase + // raises DatasetNotFound, which we map to a documented InvalidInput (400). + let err = namespace + .create_table_branch(CreateTableBranchRequest { + id: Some(table_id), + name: "dev".to_string(), + from_version: Some(999), + ..Default::default() + }) + .await + .unwrap_err(); + assert_eq!( + namespace_code(&err), + Some(ErrorCode::InvalidInput), + "non-existent from_version should map to InvalidInput, got: {}", + err + ); + assert!( + err.to_string().to_lowercase().contains("does not exist"), + "error should name the missing source, got: {}", + err + ); + } + #[tokio::test] async fn test_create_and_list_tags() { let (namespace, _temp_dir, table_id) = create_tagged_test_table(3).await; diff --git a/rust/lance-namespace-impls/src/rest.rs b/rust/lance-namespace-impls/src/rest.rs index 27a563d2807..c245a1e6dc1 100644 --- a/rust/lance-namespace-impls/src/rest.rs +++ b/rust/lance-namespace-impls/src/rest.rs @@ -23,11 +23,13 @@ use lance_namespace::models::{ AlterTransactionRequest, AlterTransactionResponse, AnalyzeTableQueryPlanRequest, BatchDeleteTableVersionsRequest, BatchDeleteTableVersionsResponse, CountTableRowsRequest, CreateMaterializedViewRequest, CreateMaterializedViewResponse, CreateNamespaceRequest, - CreateNamespaceResponse, CreateTableIndexRequest, CreateTableIndexResponse, CreateTableRequest, - CreateTableResponse, CreateTableScalarIndexResponse, CreateTableTagRequest, - CreateTableTagResponse, CreateTableVersionRequest, CreateTableVersionResponse, - DeclareTableRequest, DeclareTableResponse, DeleteFromTableRequest, DeleteFromTableResponse, - DeleteTableTagRequest, DeleteTableTagResponse, DeregisterTableRequest, DeregisterTableResponse, + 
CreateNamespaceResponse, CreateTableBranchRequest, CreateTableBranchResponse, + CreateTableIndexRequest, CreateTableIndexResponse, CreateTableRequest, CreateTableResponse, + CreateTableScalarIndexResponse, CreateTableTagRequest, CreateTableTagResponse, + CreateTableVersionRequest, CreateTableVersionResponse, DeclareTableRequest, + DeclareTableResponse, DeleteFromTableRequest, DeleteFromTableResponse, + DeleteTableBranchRequest, DeleteTableBranchResponse, DeleteTableTagRequest, + DeleteTableTagResponse, DeregisterTableRequest, DeregisterTableResponse, DescribeNamespaceRequest, DescribeNamespaceResponse, DescribeTableIndexStatsRequest, DescribeTableIndexStatsResponse, DescribeTableRequest, DescribeTableResponse, DescribeTableVersionRequest, DescribeTableVersionResponse, DescribeTransactionRequest, @@ -36,7 +38,8 @@ use lance_namespace::models::{ ErrorResponse, ExplainTableQueryPlanRequest, GetTableStatsRequest, GetTableStatsResponse, GetTableTagVersionRequest, GetTableTagVersionResponse, InsertIntoTableRequest, InsertIntoTableResponse, ListNamespacesRequest, ListNamespacesResponse, - ListTableIndicesRequest, ListTableIndicesResponse, ListTableTagsRequest, ListTableTagsResponse, + ListTableBranchesRequest, ListTableBranchesResponse, ListTableIndicesRequest, + ListTableIndicesResponse, ListTableTagsRequest, ListTableTagsResponse, ListTableVersionsRequest, ListTableVersionsResponse, ListTablesRequest, ListTablesResponse, MergeInsertIntoTableRequest, MergeInsertIntoTableResponse, NamespaceExistsRequest, QueryTableRequest, RefreshMaterializedViewRequest, RefreshMaterializedViewResponse, @@ -1294,6 +1297,13 @@ impl LanceNamespace for RestNamespace { descending_str = descending.to_string(); query.push(("descending", descending_str.as_str())); } + // Forward branch as a query param (this op sends no body). + // describe_table_version differs: branch rides its body, already serialized. 
+ let branch_str; + if let Some(ref branch) = request.branch { + branch_str = branch.clone(); + query.push(("branch", branch_str.as_str())); + } self.post_json(&path, &query, &(), "list_table_versions", &id) .await } @@ -1553,6 +1563,55 @@ impl LanceNamespace for RestNamespace { .await } + async fn create_table_branch( + &self, + request: CreateTableBranchRequest, + ) -> Result { + self.record_op("create_table_branch"); + let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/branches/create", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "create_table_branch", &id) + .await + } + + async fn list_table_branches( + &self, + request: ListTableBranchesRequest, + ) -> Result { + self.record_op("list_table_branches"); + let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/branches/list", encoded_id); + let mut query = vec![("delimiter", self.delimiter.as_str())]; + let page_token_str; + if let Some(ref pt) = request.page_token { + page_token_str = pt.clone(); + query.push(("page_token", page_token_str.as_str())); + } + let limit_str; + if let Some(limit) = request.limit { + limit_str = limit.to_string(); + query.push(("limit", limit_str.as_str())); + } + self.post_json(&path, &query, &request, "list_table_branches", &id) + .await + } + + async fn delete_table_branch( + &self, + request: DeleteTableBranchRequest, + ) -> Result { + self.record_op("delete_table_branch"); + let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/branches/delete", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "delete_table_branch", &id) + .await + } + fn namespace_id(&self) -> String { format!( "RestNamespace {{ endpoint: {:?}, delimiter: {:?} }}", diff 
--git a/rust/lance-namespace-impls/src/rest_adapter.rs b/rust/lance-namespace-impls/src/rest_adapter.rs index 6a3875ebf29..7324ab0bb0e 100644 --- a/rust/lance-namespace-impls/src/rest_adapter.rs +++ b/rust/lance-namespace-impls/src/rest_adapter.rs @@ -141,6 +141,10 @@ impl RestAdapter { .route("/v1/table/:id/tags/create", post(create_table_tag)) .route("/v1/table/:id/tags/delete", post(delete_table_tag)) .route("/v1/table/:id/tags/update", post(update_table_tag)) + // Branch operations + .route("/v1/table/:id/branches/create", post(create_table_branch)) + .route("/v1/table/:id/branches/list", post(list_table_branches)) + .route("/v1/table/:id/branches/delete", post(delete_table_branch)) // Query plan operations .route("/v1/table/:id/explain_plan", post(explain_table_query_plan)) .route("/v1/table/:id/analyze_plan", post(analyze_table_query_plan)) @@ -302,6 +306,7 @@ struct PaginationQuery { limit: Option, include_declared: Option, descending: Option, + branch: Option, } #[derive(Debug, Deserialize)] @@ -325,11 +330,13 @@ fn error_code_to_status(code: u32) -> StatusCode { | Some(lance_namespace::error::ErrorCode::TableTagNotFound) | Some(lance_namespace::error::ErrorCode::TransactionNotFound) | Some(lance_namespace::error::ErrorCode::TableVersionNotFound) - | Some(lance_namespace::error::ErrorCode::TableColumnNotFound) => StatusCode::NOT_FOUND, + | Some(lance_namespace::error::ErrorCode::TableColumnNotFound) + | Some(lance_namespace::error::ErrorCode::TableBranchNotFound) => StatusCode::NOT_FOUND, Some(lance_namespace::error::ErrorCode::NamespaceAlreadyExists) | Some(lance_namespace::error::ErrorCode::TableAlreadyExists) | Some(lance_namespace::error::ErrorCode::TableIndexAlreadyExists) | Some(lance_namespace::error::ErrorCode::TableTagAlreadyExists) + | Some(lance_namespace::error::ErrorCode::TableBranchAlreadyExists) | Some(lance_namespace::error::ErrorCode::ConcurrentModification) => StatusCode::CONFLICT, Some(lance_namespace::error::ErrorCode::NamespaceNotEmpty) 
| Some(lance_namespace::error::ErrorCode::InvalidTableState) => StatusCode::CONFLICT, @@ -847,6 +854,7 @@ async fn list_table_versions( page_token: params.page_token, limit: params.limit, descending: params.descending, + branch: params.branch, identity: extract_identity(&headers), ..Default::default() }; @@ -872,6 +880,7 @@ async fn create_table_version( manifest_size: body.manifest_size, e_tag: body.e_tag, metadata: body.metadata, + branch: body.branch, ..Default::default() }; @@ -891,6 +900,7 @@ async fn describe_table_version( let request = DescribeTableVersionRequest { id: Some(parse_id(&id, query.delimiter.as_deref())), version: body.version, + branch: body.branch, identity: extract_identity(&headers), ..Default::default() }; @@ -912,6 +922,7 @@ async fn batch_delete_table_versions( id: Some(parse_id(&id, params.delimiter.as_deref())), identity: extract_identity(&headers), ranges: body.ranges, + branch: body.branch, ..Default::default() }; @@ -1261,6 +1272,62 @@ async fn update_table_tag( } } +// ============================================================================ +// Branch Operation Handlers +// ============================================================================ + +async fn create_table_branch( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); + + match backend.create_table_branch(request).await { + Ok(response) => (StatusCode::CREATED, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn list_table_branches( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, +) -> Response { + let request = ListTableBranchesRequest { + id: Some(parse_id(&id, params.delimiter.as_deref())), + page_token: params.page_token, + limit: params.limit, + identity: extract_identity(&headers), + 
..Default::default() + }; + + match backend.list_table_branches(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn delete_table_branch( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); + + match backend.delete_table_branch(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + // ============================================================================ // Query Plan Operation Handlers // ============================================================================ @@ -1456,15 +1523,28 @@ mod tests { impl RestServerFixture { async fn new() -> Self { + Self::build(false).await + } + + /// Like [`Self::new`], with managed versioning (table version + /// tracking through the `__manifest` catalog) enabled on the + /// backend. 
+ async fn new_managed() -> Self { + Self::build(true).await + } + + async fn build(managed_versioning: bool) -> Self { let temp_dir = TempDir::new().unwrap(); let temp_path = temp_dir.path().to_str().unwrap().to_string(); // Create DirectoryNamespace backend with manifest enabled - let backend = DirectoryNamespaceBuilder::new(&temp_path) - .manifest_enabled(true) - .build() - .await - .unwrap(); + let mut builder = DirectoryNamespaceBuilder::new(&temp_path).manifest_enabled(true); + if managed_versioning { + builder = builder + .table_version_tracking_enabled(true) + .table_version_storage_enabled(true); + } + let backend = builder.build().await.unwrap(); let backend = Arc::new(backend); // Start REST server with port 0 (OS assigns available port) @@ -3189,6 +3269,455 @@ mod tests { ); } + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn test_branch_param_forwarded_end_to_end() { + let fixture = RestServerFixture::new().await; + + fixture + .namespace + .create_namespace(CreateNamespaceRequest { + id: Some(vec!["branch_fwd_ns".to_string()]), + ..Default::default() + }) + .await + .unwrap(); + fixture + .namespace + .create_table( + CreateTableRequest { + id: Some(vec![ + "branch_fwd_ns".to_string(), + "branch_fwd_table".to_string(), + ]), + mode: Some("create".to_string()), + ..Default::default() + }, + create_test_arrow_data(), + ) + .await + .unwrap(); + + let id = vec!["branch_fwd_ns".to_string(), "branch_fwd_table".to_string()]; + + // Control: no branch succeeds (resolves the main chain). + assert!( + fixture + .namespace + .list_table_versions(ListTableVersionsRequest { + id: Some(id.clone()), + ..Default::default() + }) + .await + .is_ok() + ); + + // list forwards branch as a query param; a bogus branch 404s at the backend. 
+ assert!( + fixture + .namespace + .list_table_versions(ListTableVersionsRequest { + id: Some(id.clone()), + branch: Some("ghost".to_string()), + ..Default::default() + }) + .await + .is_err(), + "branch must be forwarded as a query param and honored by the backend" + ); + + // describe carries branch in the request body; a bogus branch likewise 404s. + assert!( + fixture + .namespace + .describe_table_version(DescribeTableVersionRequest { + id: Some(id.clone()), + branch: Some("ghost".to_string()), + ..Default::default() + }) + .await + .is_err(), + "branch must be forwarded in the request body and honored by the backend" + ); + + fixture.server_handle.shutdown(); + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn test_branch_crud_end_to_end() { + let fixture = RestServerFixture::new().await; + + fixture + .namespace + .create_namespace(CreateNamespaceRequest { + id: Some(vec!["branch_crud_ns".to_string()]), + ..Default::default() + }) + .await + .unwrap(); + fixture + .namespace + .create_table( + CreateTableRequest { + id: Some(vec![ + "branch_crud_ns".to_string(), + "branch_crud_table".to_string(), + ]), + mode: Some("create".to_string()), + ..Default::default() + }, + create_test_arrow_data(), + ) + .await + .unwrap(); + + let id = vec![ + "branch_crud_ns".to_string(), + "branch_crud_table".to_string(), + ]; + + // create -> list shows it (client -> server -> directory backend) + fixture + .namespace + .create_table_branch(CreateTableBranchRequest { + id: Some(id.clone()), + name: "dev".to_string(), + ..Default::default() + }) + .await + .unwrap(); + + let listed = fixture + .namespace + .list_table_branches(ListTableBranchesRequest { + id: Some(id.clone()), + ..Default::default() + }) + .await + .unwrap(); + assert!( + listed.branches.contains_key("dev"), + "created branch should appear in list: {:?}", + listed.branches + ); + + // duplicate create -> 409 Conflict + let port = fixture.server_handle.port(); + let client = 
reqwest::Client::new(); + let table_path = "branch_crud_ns%24branch_crud_table"; + let resp = client + .post(format!( + "http://127.0.0.1:{}/v1/table/{}/branches/create", + port, table_path + )) + .query(&[("delimiter", "$")]) + .json(&serde_json::json!({ "name": "dev" })) + .send() + .await + .unwrap(); + assert_eq!( + resp.status(), + 409, + "duplicate branch create should map to 409, got {}", + resp.status() + ); + + // delete -> list no longer shows it + fixture + .namespace + .delete_table_branch(DeleteTableBranchRequest { + id: Some(id.clone()), + name: "dev".to_string(), + ..Default::default() + }) + .await + .unwrap(); + + let listed = fixture + .namespace + .list_table_branches(ListTableBranchesRequest { + id: Some(id.clone()), + ..Default::default() + }) + .await + .unwrap(); + assert!( + !listed.branches.contains_key("dev"), + "deleted branch must not appear in list: {:?}", + listed.branches + ); + + // delete missing -> 404 Not Found (raw HTTP validates TableBranchNotFound -> 404). + let resp = client + .post(format!( + "http://127.0.0.1:{}/v1/table/{}/branches/delete", + port, table_path + )) + .query(&[("delimiter", "$")]) + .json(&serde_json::json!({ "name": "dev" })) + .send() + .await + .unwrap(); + assert_eq!( + resp.status(), + 404, + "deleting a missing branch should map to 404, got {}", + resp.status() + ); + + fixture.server_handle.shutdown(); + } + + /// The managed (manifest-tracked) branch flow over REST: create a + /// managed table and a branch through the RestNamespace client, open + /// the branch via `from_namespace(...).with_branch`, commit on it, + /// check out across branches at an overlapping version number, and + /// round-trip a branch-pointing tag. Mirrors the DirectoryNamespace + /// e2e to prove the REST layer forwards everything the managed commit + /// store needs. 
+ #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn test_namespace_managed_branch_e2e() { + use arrow::array::Int32Array; + use arrow::datatypes::{DataType, Field as ArrowField, Schema as ArrowSchema}; + use arrow::record_batch::{RecordBatch, RecordBatchIterator}; + use futures::TryStreamExt; + use lance::dataset::builder::DatasetBuilder; + use lance::dataset::refs::Ref; + use lance::dataset::{Dataset, WriteMode, WriteParams}; + use lance_namespace::LanceNamespace; + + async fn scan_ids(ds: &Dataset) -> Vec { + let batches: Vec = ds + .scan() + .try_into_stream() + .await + .unwrap() + .try_collect() + .await + .unwrap(); + let mut ids: Vec = batches + .iter() + .flat_map(|b| { + b.column(0) + .as_any() + .downcast_ref::() + .unwrap() + .values() + .to_vec() + }) + .collect(); + ids.sort(); + ids + } + + let fixture = RestServerFixture::new_managed().await; + let namespace = Arc::new(fixture.namespace.clone()) as Arc; + let table_id = vec!["mb_table".to_string()]; + + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + let batch = |seed: i32| { + RecordBatch::try_new(schema.clone(), vec![Arc::new(Int32Array::from(vec![seed]))]) + .unwrap() + }; + + // Managed main: v1 (id=1), v2 (id=2). + let mut main_ds = Dataset::write_into_namespace( + RecordBatchIterator::new(vec![Ok(batch(1))], schema.clone()), + namespace.clone(), + table_id.clone(), + Some(WriteParams { + mode: WriteMode::Create, + ..Default::default() + }), + ) + .await + .unwrap(); + main_ds + .append( + RecordBatchIterator::new(vec![Ok(batch(2))], schema.clone()), + None, + ) + .await + .unwrap(); + + // The REST layer must surface managed versioning for the deferred + // commit handler to engage. 
+ let described = namespace + .describe_table(DescribeTableRequest { + id: Some(table_id.clone()), + ..Default::default() + }) + .await + .unwrap(); + assert_eq!( + described.managed_versioning, + Some(true), + "managed_versioning must survive the REST round trip" + ); + let main_chain_len = |ns: Arc, table_id: Vec| async move { + ns.list_table_versions(ListTableVersionsRequest { + id: Some(table_id), + ..Default::default() + }) + .await + .unwrap() + .versions + .len() + }; + assert_eq!(main_chain_len(namespace.clone(), table_id.clone()).await, 2); + + // Branch via the REST client, then open and commit on it. + namespace + .create_table_branch(CreateTableBranchRequest { + id: Some(table_id.clone()), + name: "exp".to_string(), + ..Default::default() + }) + .await + .unwrap(); + let mut branch_ds = DatasetBuilder::from_namespace(namespace.clone(), table_id.clone()) + .await + .unwrap() + .with_branch("exp", None) + .load() + .await + .unwrap(); + assert_eq!(branch_ds.manifest().branch.as_deref(), Some("exp")); + assert!( + branch_ds + .branch_location() + .path + .as_ref() + .ends_with("tree/exp"), + "the branch dataset must be rooted at the branch chain" + ); + branch_ds + .append( + RecordBatchIterator::new(vec![Ok(batch(3))], schema.clone()), + None, + ) + .await + .unwrap(); + assert_eq!(branch_ds.manifest().branch.as_deref(), Some("exp")); + assert_eq!(scan_ids(&branch_ds).await, vec![1, 2, 3]); + assert_eq!( + main_chain_len(namespace.clone(), table_id.clone()).await, + 2, + "a branch commit must not advance main's chain" + ); + + // Cross-branch checkout at an overlapping version number must land + // on the branch chain (branch numbering continues from the fork + // point, so both chains have this version). 
+ let overlap_version = branch_ds.version().version; + while main_ds.version().version < overlap_version { + main_ds + .append( + RecordBatchIterator::new(vec![Ok(batch(100))], schema.clone()), + None, + ) + .await + .unwrap(); + } + let on_branch = main_ds + .checkout_version(Ref::Version(Some("exp".to_string()), Some(overlap_version))) + .await + .unwrap(); + assert_eq!(on_branch.manifest().branch.as_deref(), Some("exp")); + assert_eq!(scan_ids(&on_branch).await, vec![1, 2, 3]); + let on_branch_latest = main_ds.checkout_branch("exp").await.unwrap(); + assert_eq!(on_branch_latest.manifest().branch.as_deref(), Some("exp")); + + // Branch-pointing tag round trip through the builder. + main_ds + .tags() + .create("exp-tag", ("exp", Some(overlap_version))) + .await + .unwrap(); + let tag_open = DatasetBuilder::from_namespace(namespace.clone(), table_id.clone()) + .await + .unwrap() + .with_tag("exp-tag") + .load() + .await + .unwrap(); + assert_eq!(tag_open.manifest().branch.as_deref(), Some("exp")); + assert_eq!(tag_open.version().version, overlap_version); + assert_eq!(scan_ids(&tag_open).await, vec![1, 2, 3]); + + fixture.server_handle.shutdown(); + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn test_list_branches_bodyless_post() { + let fixture = RestServerFixture::new().await; + + fixture + .namespace + .create_namespace(CreateNamespaceRequest { + id: Some(vec!["list_post_ns".to_string()]), + ..Default::default() + }) + .await + .unwrap(); + fixture + .namespace + .create_table( + CreateTableRequest { + id: Some(vec![ + "list_post_ns".to_string(), + "list_post_table".to_string(), + ]), + mode: Some("create".to_string()), + ..Default::default() + }, + create_test_arrow_data(), + ) + .await + .unwrap(); + fixture + .namespace + .create_table_branch(CreateTableBranchRequest { + id: Some(vec![ + "list_post_ns".to_string(), + "list_post_table".to_string(), + ]), + name: "dev".to_string(), + ..Default::default() + }) + .await + .unwrap(); 
+ + let port = fixture.server_handle.port(); + let client = reqwest::Client::new(); + let resp = client + .post(format!( + "http://127.0.0.1:{}/v1/table/list_post_ns%24list_post_table/branches/list", + port + )) + .query(&[("delimiter", "$")]) + .send() + .await + .unwrap(); + assert_eq!( + resp.status(), + 200, + "bodyless list POST should succeed, got {}", + resp.status() + ); + let body: ListTableBranchesResponse = resp.json().await.unwrap(); + assert!( + body.branches.contains_key("dev"), + "bodyless list should return the branch, got: {:?}", + body.branches + ); + + fixture.server_handle.shutdown(); + } + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn test_describe_table_version() { let fixture = RestServerFixture::new().await; diff --git a/rust/lance-namespace/src/error.rs b/rust/lance-namespace/src/error.rs index 8a73d4db8e8..5ed05541be3 100644 --- a/rust/lance-namespace/src/error.rs +++ b/rust/lance-namespace/src/error.rs @@ -78,6 +78,10 @@ pub enum ErrorCode { TableSchemaValidationError = 20, /// Request was throttled due to rate limiting or too many concurrent operations Throttling = 21, + /// The specified table branch does not exist + TableBranchNotFound = 22, + /// A table branch with this name already exists + TableBranchAlreadyExists = 23, } impl ErrorCode { @@ -113,6 +117,8 @@ impl ErrorCode { 19 => Some(Self::InvalidTableState), 20 => Some(Self::TableSchemaValidationError), 21 => Some(Self::Throttling), + 22 => Some(Self::TableBranchNotFound), + 23 => Some(Self::TableBranchAlreadyExists), _ => None, } } @@ -143,6 +149,8 @@ impl std::fmt::Display for ErrorCode { Self::InvalidTableState => "InvalidTableState", Self::TableSchemaValidationError => "TableSchemaValidationError", Self::Throttling => "Throttling", + Self::TableBranchNotFound => "TableBranchNotFound", + Self::TableBranchAlreadyExists => "TableBranchAlreadyExists", }; write!(f, "{}", name) } @@ -260,6 +268,14 @@ pub enum NamespaceError { /// Request was throttled due to 
rate limiting or too many concurrent operations. #[snafu(display("Throttling: {message}"))] Throttling { message: String }, + + /// The specified table branch does not exist. + #[snafu(display("Table branch not found: {message}"))] + TableBranchNotFound { message: String }, + + /// A table branch with this name already exists. + #[snafu(display("Table branch already exists: {message}"))] + TableBranchAlreadyExists { message: String }, } impl NamespaceError { @@ -291,7 +307,9 @@ impl NamespaceError { | Self::Internal { message } | Self::InvalidTableState { message } | Self::TableSchemaValidationError { message } - | Self::Throttling { message } => message, + | Self::Throttling { message } + | Self::TableBranchNotFound { message } + | Self::TableBranchAlreadyExists { message } => message, } } @@ -322,6 +340,8 @@ impl NamespaceError { Self::InvalidTableState { .. } => ErrorCode::InvalidTableState, Self::TableSchemaValidationError { .. } => ErrorCode::TableSchemaValidationError, Self::Throttling { .. } => ErrorCode::Throttling, + Self::TableBranchNotFound { .. } => ErrorCode::TableBranchNotFound, + Self::TableBranchAlreadyExists { .. 
} => ErrorCode::TableBranchAlreadyExists, } } @@ -355,6 +375,8 @@ impl NamespaceError { Self::TableSchemaValidationError { message } } Some(ErrorCode::Throttling) => Self::Throttling { message }, + Some(ErrorCode::TableBranchNotFound) => Self::TableBranchNotFound { message }, + Some(ErrorCode::TableBranchAlreadyExists) => Self::TableBranchAlreadyExists { message }, None => Self::Internal { message }, } } @@ -380,7 +402,7 @@ mod tests { #[test] fn test_error_code_roundtrip() { - for code in 0..=21 { + for code in 0..=23 { let error_code = ErrorCode::from_u32(code).unwrap(); assert_eq!(error_code.as_u32(), code); } diff --git a/rust/lance-namespace/src/namespace.rs b/rust/lance-namespace/src/namespace.rs index 0ee93b80b2e..faa7ec57367 100644 --- a/rust/lance-namespace/src/namespace.rs +++ b/rust/lance-namespace/src/namespace.rs @@ -14,11 +14,13 @@ use lance_namespace_reqwest_client::models::{ AlterTransactionRequest, AlterTransactionResponse, AnalyzeTableQueryPlanRequest, BatchDeleteTableVersionsRequest, BatchDeleteTableVersionsResponse, CountTableRowsRequest, CreateMaterializedViewRequest, CreateMaterializedViewResponse, CreateNamespaceRequest, - CreateNamespaceResponse, CreateTableIndexRequest, CreateTableIndexResponse, CreateTableRequest, - CreateTableResponse, CreateTableScalarIndexResponse, CreateTableTagRequest, - CreateTableTagResponse, CreateTableVersionRequest, CreateTableVersionResponse, - DeclareTableRequest, DeclareTableResponse, DeleteFromTableRequest, DeleteFromTableResponse, - DeleteTableTagRequest, DeleteTableTagResponse, DeregisterTableRequest, DeregisterTableResponse, + CreateNamespaceResponse, CreateTableBranchRequest, CreateTableBranchResponse, + CreateTableIndexRequest, CreateTableIndexResponse, CreateTableRequest, CreateTableResponse, + CreateTableScalarIndexResponse, CreateTableTagRequest, CreateTableTagResponse, + CreateTableVersionRequest, CreateTableVersionResponse, DeclareTableRequest, + DeclareTableResponse, DeleteFromTableRequest, 
DeleteFromTableResponse, + DeleteTableBranchRequest, DeleteTableBranchResponse, DeleteTableTagRequest, + DeleteTableTagResponse, DeregisterTableRequest, DeregisterTableResponse, DescribeNamespaceRequest, DescribeNamespaceResponse, DescribeTableIndexStatsRequest, DescribeTableIndexStatsResponse, DescribeTableRequest, DescribeTableResponse, DescribeTableVersionRequest, DescribeTableVersionResponse, DescribeTransactionRequest, @@ -27,7 +29,8 @@ use lance_namespace_reqwest_client::models::{ ExplainTableQueryPlanRequest, GetTableStatsRequest, GetTableStatsResponse, GetTableTagVersionRequest, GetTableTagVersionResponse, InsertIntoTableRequest, InsertIntoTableResponse, ListNamespacesRequest, ListNamespacesResponse, - ListTableIndicesRequest, ListTableIndicesResponse, ListTableTagsRequest, ListTableTagsResponse, + ListTableBranchesRequest, ListTableBranchesResponse, ListTableIndicesRequest, + ListTableIndicesResponse, ListTableTagsRequest, ListTableTagsResponse, ListTableVersionsRequest, ListTableVersionsResponse, ListTablesRequest, ListTablesResponse, MergeInsertIntoTableRequest, MergeInsertIntoTableResponse, NamespaceExistsRequest, QueryTableRequest, RefreshMaterializedViewRequest, RefreshMaterializedViewResponse, @@ -500,6 +503,44 @@ pub trait LanceNamespace: Send + Sync + std::fmt::Debug { Err(Error::not_supported("update_table_tag not implemented")) } + /// Create a branch for a table. + /// + /// The new branch forks from the source ref selected by `from_branch` and + /// `from_version`, defaulting to the latest version of the main branch when + /// both are omitted. + /// + /// # Errors + /// + /// - Returns [`crate::ErrorCode::TableBranchAlreadyExists`] if a branch with the same name already exists. + /// - Returns [`crate::ErrorCode::TableNotFound`] if the table does not exist. + /// - Returns [`crate::ErrorCode::InvalidInput`] if `from_branch` or `from_version` references a source that does not exist. 
+ async fn create_table_branch( + &self, + _request: CreateTableBranchRequest, + ) -> Result { + Err(Error::not_supported("create_table_branch not implemented")) + } + + /// List all branches for a table. + async fn list_table_branches( + &self, + _request: ListTableBranchesRequest, + ) -> Result { + Err(Error::not_supported("list_table_branches not implemented")) + } + + /// Delete a branch from a table. + /// + /// # Errors + /// + /// Returns [`crate::ErrorCode::TableBranchNotFound`] if the branch does not exist. + async fn delete_table_branch( + &self, + _request: DeleteTableBranchRequest, + ) -> Result { + Err(Error::not_supported("delete_table_branch not implemented")) + } + /// Return a human-readable unique identifier for this namespace instance. /// /// This is used for equality comparison and hashing when the namespace is diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index bc15fe669a7..c9cc356aaa6 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -69,7 +69,7 @@ use std::sync::Arc; use tracing::{info, instrument}; pub(crate) mod blob; -mod branch_location; +pub(crate) mod branch_location; pub mod builder; pub mod cleanup; pub mod delta; @@ -501,11 +501,14 @@ impl Dataset { ) -> Result { let (source_branch, version_number) = self.resolve_reference(version.into()).await?; let branch_location = self.branch_location().find_branch(Some(branch))?; + let source_location = self + .branch_location() + .find_branch(source_branch.as_deref())?; let clone_op = Operation::Clone { is_shallow: true, ref_name: source_branch.clone(), ref_version: version_number, - ref_path: String::from(self.uri()), + ref_path: source_location.uri, branch_name: Some(branch.to_string()), }; let transaction = Transaction::new(version_number, clone_op, None); @@ -557,6 +560,15 @@ impl Dataset { version_number: Option, branch: Option<&str>, ) -> Result { + // Reject malformed names at the boundary (mirroring the branch CRUD + // paths) so they fail 
as InvalidRef instead of tripping the wrong-chain + // check below + if let Some(branch_name) = branch + && !Branches::is_main_branch(branch) + { + refs::check_valid_branch(branch_name)?; + } + let new_location = self.branch_location().find_branch(branch)?; let manifest_location = if let Some(version_number) = version_number { @@ -584,6 +596,21 @@ impl Dataset { self.session.as_ref(), ) .await?; + + // The resolved manifest must belong to the requested branch. A mismatch + // means the commit handler resolved against a different chain (for + // example an external manifest store that ignores branch-qualified + // paths); error loudly rather than hand back another branch's data. + let requested_branch = branch.and_then(refs::standardize_branch); + if manifest.branch.as_deref() != requested_branch.as_deref() { + return Err(Error::internal(format!( + "checkout of branch '{}' at version {} resolved a manifest belonging to branch '{}'", + refs::normalize_branch(branch), + manifest.version, + refs::normalize_branch(manifest.branch.as_deref()), + ))); + } + Self::checkout_manifest( self.object_store.clone(), new_location.path, @@ -780,12 +807,50 @@ impl Dataset { batches: impl RecordBatchReader + Send + 'static, namespace_client: Arc, table_id: Vec, + params: Option, + ) -> Result { + Self::write_into_namespace_impl(batches, namespace_client, table_id, None, params).await + } + + /// Write into a branch of a namespace client-managed table. + /// + /// Behaves like [`write_into_namespace`](Self::write_into_namespace), but APPEND and + /// OVERWRITE open and commit against `branch` instead of main. CREATE is rejected, + /// since a branch forks from an existing version. 
+ pub async fn write_into_namespace_on_branch( + batches: impl RecordBatchReader + Send + 'static, + namespace_client: Arc, + table_id: Vec, + branch: &str, + params: Option, + ) -> Result { + Self::write_into_namespace_impl( + batches, + namespace_client, + table_id, + Some(branch.to_string()), + params, + ) + .await + } + + async fn write_into_namespace_impl( + batches: impl RecordBatchReader + Send + 'static, + namespace_client: Arc, + table_id: Vec, + branch: Option, mut params: Option, ) -> Result { let mut write_params = params.take().unwrap_or_default(); match write_params.mode { WriteMode::Create => { + if branch.is_some() { + return Err(Error::not_supported_source( + "cannot create a table on a branch; create on main first, then branch it" + .into(), + )); + } let declare_request = DeclareTableRequest { id: Some(table_id.clone()), ..Default::default() @@ -803,10 +868,13 @@ impl Dataset { // Set up commit handler when managed_versioning is enabled if response.managed_versioning == Some(true) { - let external_store = LanceNamespaceExternalManifestStore::new( + // The store derives the branch a request targets from the + // base path it is handed, resolved against the table root. + let external_store = LanceNamespaceExternalManifestStore::for_table_uri( namespace_client.clone(), table_id.clone(), - ); + &uri, + )?; let commit_handler: Arc = Arc::new(ExternalManifestCommitHandler { external_manifest_store: Arc::new(external_store), @@ -858,18 +926,25 @@ impl Dataset { ))) })?; - // Set up commit handler when managed_versioning is enabled - if response.managed_versioning == Some(true) { - let external_store = LanceNamespaceExternalManifestStore::new( - namespace_client.clone(), - table_id.clone(), - ); - let commit_handler: Arc = - Arc::new(ExternalManifestCommitHandler { + // Set up commit handler when managed_versioning is enabled. 
+ // It must ride on the dataset opened below: InsertBuilder + // commits through the destination dataset's handler and does + // not consult write params for Dataset destinations. + let commit_handler: Option> = + if response.managed_versioning == Some(true) { + // The store derives the branch a request targets from the + // base path it is handed, resolved against the table root. + let external_store = LanceNamespaceExternalManifestStore::for_table_uri( + namespace_client.clone(), + table_id.clone(), + uri.as_str(), + )?; + Some(Arc::new(ExternalManifestCommitHandler { external_manifest_store: Arc::new(external_store), - }); - write_params.commit_handler = Some(commit_handler); - } + })) + } else { + None + }; // Set initial credentials and provider from namespace_client if let Some(namespace_storage_options) = response.storage_options { @@ -908,6 +983,12 @@ impl Dataset { { builder = builder.with_storage_options_accessor(accessor.clone()); } + if let Some(commit_handler) = commit_handler { + builder = builder.with_commit_handler(commit_handler); + } + if let Some(branch) = &branch { + builder = builder.with_branch(branch, None); + } let dataset = Arc::new(builder.load().await?); Self::write(batches, dataset, Some(write_params)).await @@ -2511,11 +2592,12 @@ impl Dataset { store_params: Option, ) -> Result { let (ref_name, version_number) = self.resolve_reference(version.into()).await?; + let source_location = self.branch_location().find_branch(ref_name.as_deref())?; let clone_op = Operation::Clone { is_shallow: true, ref_name, ref_version: version_number, - ref_path: self.uri.clone(), + ref_path: source_location.uri, branch_name: None, }; let transaction = Transaction::new(version_number, clone_op, None); diff --git a/rust/lance/src/dataset/branch_location.rs b/rust/lance/src/dataset/branch_location.rs index 2dd9f3aa860..3a1185c8cf8 100644 --- a/rust/lance/src/dataset/branch_location.rs +++ b/rust/lance/src/dataset/branch_location.rs @@ -62,6 +62,38 @@ impl 
BranchLocation { Ok(root_path_str) } + /// The branch a location under `root` targets: the inverse of + /// [`Self::find_branch`]. `location` must be either `root` itself (main) + /// or `/tree/`; anything else is rejected so a caller never + /// misattributes an unrelated location to a branch. + pub fn branch_of(root: &str, location: &str) -> Result> { + if location == root { + return Ok(None); + } + // Require the `/` component boundary after the root so a sibling path + // that merely shares the root as a string prefix is rejected. + let branch = location + .strip_prefix(root) + .and_then(|rel| { + if root.is_empty() { + Some(rel) + } else { + rel.strip_prefix('/') + } + }) + .and_then(|rel| rel.strip_prefix(BRANCH_DIR)) + .and_then(|rel| rel.strip_prefix('/')) + .filter(|name| !name.is_empty()); + + match branch { + Some(name) => Ok(Some(name.to_string())), + None => Err(Error::invalid_input(format!( + "cannot derive a branch for location '{}': expected the table root '{}' or a branch chain under '{}/{}'", + location, root, root, BRANCH_DIR + ))), + } + } + /// Find the target branch location pub fn find_branch(&self, branch_name: Option<&str>) -> Result { if branch_name == self.branch.as_deref() { @@ -223,6 +255,35 @@ mod tests { assert!(fs::create_dir_all(std::path::Path::new(new_location.uri.as_str())).is_ok()); } + #[test] + fn test_branch_of() { + let derive = |root: &str, location: &str| BranchLocation::branch_of(root, location); + + // The table root targets main. + assert_eq!(derive("data/t.lance", "data/t.lance").unwrap(), None); + + // Branch chains, including multi-segment branch names. + assert_eq!( + derive("data/t.lance", "data/t.lance/tree/exp").unwrap(), + Some("exp".to_string()) + ); + assert_eq!( + derive("data/t.lance", "data/t.lance/tree/bugfix/issue-123").unwrap(), + Some("bugfix/issue-123".to_string()) + ); + + // A sibling path sharing the root as a string prefix is not a branch. 
+ assert!(derive("data/t", "data/tx/tree/exp").is_err()); + // Neither is a sub-path outside the branch directory. + assert!(derive("data/t.lance", "data/t.lance/other/exp").is_err()); + // Nor a path missing the component boundary after the branch dir. + assert!(derive("data/t.lance", "data/t.lance/treex").is_err()); + // An empty branch name is invalid. + assert!(derive("data/t.lance", "data/t.lance/tree").is_err()); + // An unrelated location is invalid. + assert!(derive("data/t.lance", "elsewhere/u.lance").is_err()); + } + #[test] fn test_find_empty_branch() { let root_path = TempStdDir::default().to_owned(); diff --git a/rust/lance/src/dataset/builder.rs b/rust/lance/src/dataset/builder.rs index 393ff45c4ea..23db7254b15 100644 --- a/rust/lance/src/dataset/builder.rs +++ b/rust/lance/src/dataset/builder.rs @@ -4,7 +4,7 @@ use std::{collections::HashMap, sync::Arc, time::Duration}; use lance_core::cache::CacheBackend; -use super::refs::{Ref, Refs}; +use super::refs::{Branches, Ref, Refs, check_valid_branch, normalize_branch, standardize_branch}; use super::{DEFAULT_INDEX_CACHE_SIZE, DEFAULT_METADATA_CACHE_SIZE, ReadParams, WriteParams}; use crate::dataset::branch_location::BranchLocation; use crate::io::commit::namespace_manifest::LanceNamespaceExternalManifestStore; @@ -53,6 +53,12 @@ pub struct DatasetBuilder { storage_options_override: Option>, /// Runtime-only exact object store bindings keyed by base path URI. base_store_params: HashMap, + /// Namespace-managed table info `(client, table_id)`, set by `from_namespace` + /// when the table uses managed versioning. The commit handler is built in + /// `build_object_store`, rooted at the resolved table path; the branch a + /// namespace request targets is derived per call from the base path the + /// handler is handed. 
+ namespace_managed: Option<(Arc, Vec)>, } impl std::fmt::Debug for DatasetBuilder { @@ -71,6 +77,7 @@ impl std::fmt::Debug for DatasetBuilder { &self.storage_options_override.is_some(), ) .field("base_store_params", &!self.base_store_params.is_empty()) + .field("namespace_managed", &self.namespace_managed.is_some()) .finish() } } @@ -90,6 +97,7 @@ impl DatasetBuilder { file_reader_options: None, storage_options_override: None, base_store_params: HashMap::new(), + namespace_managed: None, } } @@ -149,16 +157,11 @@ impl DatasetBuilder { let mut builder = Self::from_uri(&table_uri); - // Check managed_versioning flag to determine if namespace-managed commits should be used + // Defer building the commit handler to load(): the manifest store is + // rooted at the resolved table path, which is only known once the + // object store is built. if response.managed_versioning == Some(true) { - let external_store = LanceNamespaceExternalManifestStore::new( - namespace_client.clone(), - table_id.clone(), - ); - let commit_handler: Arc = Arc::new(ExternalManifestCommitHandler { - external_manifest_store: Arc::new(external_store), - }); - builder.commit_handler = Some(commit_handler); + builder.namespace_managed = Some((namespace_client.clone(), table_id.clone())); } // Use namespace storage options if available @@ -524,13 +527,8 @@ impl DatasetBuilder { /// Build a lance object store for the given config pub async fn build_object_store( - self, + mut self, ) -> Result<(Arc, Path, Arc)> { - let commit_handler = match self.commit_handler { - Some(commit_handler) => Ok(commit_handler), - None => commit_handler_from_url(&self.table_uri, &Some(self.options.clone())).await, - }?; - let storage_options = self .options .storage_options() @@ -546,13 +544,13 @@ impl DatasetBuilder { .unwrap_or_default(); #[allow(deprecated)] - match &self.options.object_store { - Some(store) => Ok(( + let (object_store, base_path) = match &self.options.object_store { + Some(store) => ( 
Arc::new(ObjectStore::new( store.0.clone(), store.1.clone(), self.options.block_size, - self.options.object_store_wrapper, + self.options.object_store_wrapper.clone(), self.options.use_constant_size_upload_parts, store.1.scheme() != "file", // If user supplied an object store then we just assume it's probably @@ -562,18 +560,35 @@ impl DatasetBuilder { None, // No storage_options available here )), Path::from(store.1.path()), - commit_handler, - )), + ), None => { - let (store, path) = ObjectStore::from_uri_and_params( - store_registry, - &self.table_uri, - &self.options, - ) - .await?; - Ok((store, path, commit_handler)) + ObjectStore::from_uri_and_params(store_registry, &self.table_uri, &self.options) + .await? } - } + }; + + // Resolve the commit handler: an explicitly set one wins; otherwise a + // namespace-managed table builds a manifest store rooted at the resolved + // table path (the branch a request targets is derived per call from the + // base path the handler is handed); otherwise fall back to the default + // for the uri. Resolving here (not in load) keeps this pub method + // consistent for every caller. + let commit_handler: Arc = + if let Some(commit_handler) = self.commit_handler.take() { + commit_handler + } else if let Some((namespace_client, table_id)) = self.namespace_managed.take() { + Arc::new(ExternalManifestCommitHandler { + external_manifest_store: Arc::new(LanceNamespaceExternalManifestStore::new( + namespace_client, + table_id, + base_path.clone(), + )), + }) + } else { + commit_handler_from_url(&self.table_uri, &Some(self.options.clone())).await? 
+ }; + + Ok((object_store, base_path, commit_handler)) } #[instrument(skip_all)] @@ -656,6 +671,14 @@ impl DatasetBuilder { let store_params = self.options.clone(); let base_store_params = (!self.base_store_params.is_empty()) .then(|| Arc::new(std::mem::take(&mut self.base_store_params))); + + // A namespace-managed table is always addressed at its root uri, so the + // effective branch is resolvable before loading: the base path is + // qualified up front and the manifest store derives the branch from it. + // An explicitly supplied commit handler opts out of the managed flow. + let managed_store_active = + self.namespace_managed.is_some() && self.commit_handler.is_none(); + let (object_store, base_path, commit_handler) = self.build_object_store().await?; // Two cases that need to check out after loading the manifest: @@ -667,7 +690,7 @@ impl DatasetBuilder { let mut need_delay_checkout = false; let (mut branch, mut version_number) = match target_ref.clone() { Some(Ref::Version(branch, version_number)) => { - if branch.is_some() { + if branch.is_some() && !managed_store_active { need_delay_checkout = true; } (branch, version_number) @@ -687,17 +710,57 @@ impl DatasetBuilder { branch: None, }, ); - let tag_content = refs.tags().get(&tag_name).await; - if let Ok(tag_content) = tag_content { - (tag_content.branch.clone(), Some(tag_content.version)) - } else { - need_delay_checkout = true; - (None, None) + match refs.tags().get(&tag_name).await { + Ok(tag_content) => { + if tag_content.branch.is_some() && !managed_store_active { + // The tag's chain lives under a different base path + // and the unmanaged handler resolves versions by + // base path only, so load the root's latest first + // and check the tag's branch/version out from it. 
+ need_delay_checkout = true; + (tag_content.branch, None) + } else { + (tag_content.branch.clone(), Some(tag_content.version)) + } + } + Err(e) => { + // A managed table is always rooted at the namespace + // location, so a tag missing here is missing. + if managed_store_active { + return Err(e); + } + need_delay_checkout = true; + (None, None) + } } } None => (None, None), }; + // Reject malformed branch names at the boundary (mirroring the branch + // CRUD paths) so they fail as InvalidRef instead of resolving oddly + if let Some(branch_name) = branch.as_deref() + && !Branches::is_main_branch(Some(branch_name)) + { + check_valid_branch(branch_name)?; + } + + // For a managed table the branch is known before loading; point the base + // path and uri at the branch chain so the loaded dataset is rooted there + // (data placement, refs and the path-derived store branch all follow the + // base path). + let (base_path, table_uri) = if managed_store_active && branch.is_some() { + let branch_location = BranchLocation { + path: base_path, + uri: table_uri, + branch: None, + } + .find_branch(branch.as_deref())?; + (branch_location.path, branch_location.uri) + } else { + (base_path, table_uri) + }; + let dataset = Self::load_by_uri( session, manifest, @@ -712,6 +775,20 @@ impl DatasetBuilder { ) .await?; + if managed_store_active { + // The base path was qualified above, so the loaded manifest must + // already be on the requested branch; a mismatch means the namespace + // resolved another chain. 
+ let requested_branch = branch.as_deref().and_then(standardize_branch); + if dataset.manifest.branch.as_deref() != requested_branch.as_deref() { + return Err(Error::internal(format!( + "open of branch '{}' resolved a manifest belonging to branch '{}'", + normalize_branch(branch.as_deref()), + normalize_branch(dataset.manifest.branch.as_deref()), + ))); + } + } + if need_delay_checkout { if let Some(Ref::Tag(tag_name)) = target_ref { let tag_content = dataset.tags().get(tag_name.as_str()).await?; diff --git a/rust/lance/src/dataset/tests/dataset_versioning.rs b/rust/lance/src/dataset/tests/dataset_versioning.rs index 5ac01c498b2..a0bc7816a32 100644 --- a/rust/lance/src/dataset/tests/dataset_versioning.rs +++ b/rust/lance/src/dataset/tests/dataset_versioning.rs @@ -565,6 +565,65 @@ async fn test_fragment_id_never_reset() { assert_eq!(dataset.manifest.max_fragment_id(), Some(4)); } +/// create_branch and shallow_clone must read the SOURCE ref's chain, not the +/// receiver's. Both chains get a version 2 with diverged row counts so a clone +/// that wrongly resolves the version under the receiver succeeds silently with +/// the wrong data. +#[tokio::test] +async fn test_create_branch_and_shallow_clone_from_other_branch() { + let tempdir = TempDir::default(); + let test_uri = tempdir.path_str(); + + let gen_rows = |start: i32, rows: u64| { + gen_batch() + .col("id", array::step_custom::(start, 1)) + .into_reader_rows(RowCount::from(rows), BatchCount::from(1)) + }; + let write = |uri: String, start: i32, rows: u64, mode: WriteMode| async move { + Dataset::write( + gen_rows(start, rows), + uri.as_str(), + Some(WriteParams { + mode, + ..Default::default() + }), + ) + .await + .unwrap() + }; + + // main v1: 50 rows. + let mut main_ds = write(test_uri.clone(), 0, 50, WriteMode::Create).await; + // dev: forked at v1, appended 30 rows -> dev v2 has 80 rows. 
+ let dev_ds = main_ds.create_branch("dev", 1, None).await.unwrap(); + write(dev_ds.uri().to_string(), 1000, 30, WriteMode::Append).await; + // Diverge main to the same version number with a different row count. + let mut main_ds = write(test_uri.clone(), 5000, 10, WriteMode::Append).await; // main v2: 60 rows + + // Cross-source create_branch: receiver is main, source is dev. + let child_ds = main_ds + .create_branch("child", ("dev", 2), None) + .await + .unwrap(); + assert_eq!( + child_ds.count_rows(None).await.unwrap(), + 80, + "child must clone dev@2, not main@2" + ); + + // Cross-source shallow_clone: same rule. + let clone_uri = format!("{}_clone", test_uri); + let cloned_ds = main_ds + .shallow_clone(&clone_uri, ("dev", 2), None) + .await + .unwrap(); + assert_eq!( + cloned_ds.count_rows(None).await.unwrap(), + 80, + "shallow clone must read dev@2, not main@2" + ); +} + #[tokio::test] async fn test_branch() { let tempdir = TempDir::default(); @@ -797,6 +856,86 @@ async fn test_branch() { "branch1" ); + // Opening at a branch-pointing tag through the builder must check out the + // tag's branch chain, not main's chain at the tag's version number. + let tag_open = DatasetBuilder::from_uri(&test_uri) + .with_tag("tag1") + .load() + .await + .unwrap(); + assert_eq!(tag_open.manifest.branch.as_deref(), Some("dev/branch2")); + assert_eq!(tag_open.version().version, 3); + assert_eq!(tag_open.count_rows(None).await.unwrap(), 100); + + // Malformed branch names are rejected at the boundary + for bad_name in ["", "branch1/"] { + let err = main_dataset + .checkout_version((Some(bad_name), None::)) + .await + .unwrap_err(); + assert!( + matches!(err, Error::InvalidRef { .. }), + "checkout of {:?} must be rejected as InvalidRef, got: {}", + bad_name, + err + ); + let err = DatasetBuilder::from_uri(&test_uri) + .with_branch(bad_name, None) + .load() + .await + .unwrap_err(); + assert!( + matches!(err, Error::InvalidRef { .. 
}), + "open of {:?} must be rejected as InvalidRef, got: {}", + bad_name, + err + ); + } + + // "main" stays a valid spelling of the main branch on checkout; the JNI + // bindings construct Ref::Version(Some("main"), _) directly. + let main_by_name = checkout_branch1.checkout_branch("main").await.unwrap(); + assert_eq!(main_by_name.manifest.branch, None); + assert_eq!(main_by_name.version().version, 1); + let main_by_ref = checkout_branch1 + .checkout_version(crate::dataset::refs::Ref::Version( + Some("main".to_string()), + None, + )) + .await + .unwrap(); + assert_eq!(main_by_ref.manifest.branch, None); + + // A checkout whose resolved manifest is not on the requested branch must + // error loudly instead of handing back another branch's data: stage main's + // manifest under a branch path that was never created, so resolution finds + // a manifest belonging to main. + use object_store::ObjectStoreExt as _; + let staged_manifest = main_dataset.manifest_location().path.clone(); + let staged_copy = Path::parse(format!( + "{}/tree/ghost/_versions/{}", + test_uri, + staged_manifest.filename().unwrap() + )) + .unwrap(); + main_dataset + .object_store + .inner + .copy(&staged_manifest, &staged_copy) + .await + .unwrap(); + let err = main_dataset.checkout_branch("ghost").await.unwrap_err(); + assert!( + err.to_string().contains("resolved a manifest belonging to"), + "expected the branch-mismatch guardrail, got: {}", + err + ); + main_dataset + .object_store + .remove_dir_all(Path::parse(format!("{}/tree/ghost", test_uri)).unwrap()) + .await + .unwrap(); + let mut dataset = main_dataset; // Finally delete all branches assert!(matches!( diff --git a/rust/lance/src/dataset/write/commit.rs b/rust/lance/src/dataset/write/commit.rs index 2ab34441997..baad71b3e39 100644 --- a/rust/lance/src/dataset/write/commit.rs +++ b/rust/lance/src/dataset/write/commit.rs @@ -104,6 +104,10 @@ impl<'a> CommitBuilder<'a> { } /// Pass a commit handler to use for the dataset. 
+ /// + /// Takes precedence over the destination dataset's own handler. If not + /// set, a `Dataset` destination commits through its own handler and a + /// `Uri` destination resolves one from the uri. pub fn with_commit_handler(mut self, commit_handler: Arc) -> Self { self.commit_handler = Some(commit_handler); self @@ -241,7 +245,9 @@ impl<'a> CommitBuilder<'a> { WriteDestination::Dataset(dataset) => ( dataset.object_store.clone(), dataset.base.clone(), - dataset.commit_handler.clone(), + self.commit_handler + .clone() + .unwrap_or_else(|| dataset.commit_handler.clone()), ), WriteDestination::Uri(uri) => { let commit_handler = if let (Some(_), Some(commit_handler)) = diff --git a/rust/lance/src/io/commit/namespace_manifest.rs b/rust/lance/src/io/commit/namespace_manifest.rs index 0587ff96ad4..92d5e7bc789 100644 --- a/rust/lance/src/io/commit/namespace_manifest.rs +++ b/rust/lance/src/io/commit/namespace_manifest.rs @@ -14,27 +14,61 @@ use lance_table::io::commit::{ManifestLocation, ManifestNamingScheme}; use object_store::ObjectStore as OSObjectStore; use object_store::path::Path; +use crate::dataset::branch_location::BranchLocation; + #[derive(Debug)] pub struct LanceNamespaceExternalManifestStore { namespace_client: Arc, table_id: Vec, + /// Object-store path of the table root (the main branch). The base path the + /// trait methods receive is resolved against this to derive which branch a + /// request targets, so a single store serves every branch of the table. + table_root: Path, } impl LanceNamespaceExternalManifestStore { - pub fn new(namespace_client: Arc, table_id: Vec) -> Self { + pub fn new( + namespace_client: Arc, + table_id: Vec, + table_root: Path, + ) -> Self { Self { namespace_client, table_id, + table_root, } } + + /// Build a store for the table rooted at `table_uri`, resolving the root + /// path from the uri without initializing an object store. 
+ pub fn for_table_uri( + namespace_client: Arc, + table_id: Vec, + table_uri: &str, + ) -> Result { + let table_root = lance_io::object_store::ObjectStore::extract_path_from_uri( + Arc::new(lance_io::object_store::ObjectStoreRegistry::default()), + table_uri, + )?; + Ok(Self::new(namespace_client, table_id, table_root)) + } + + /// Derive the branch targeted by `base` (the table root for main, or a + /// branch chain produced by `BranchLocation::find_branch`). The branch + /// path layout is owned by [`BranchLocation`]; this store never parses or + /// constructs it directly. + fn branch_for_base(&self, base: &str) -> Result> { + BranchLocation::branch_of(self.table_root.as_ref(), base) + } } #[async_trait] impl ExternalManifestStore for LanceNamespaceExternalManifestStore { - async fn get(&self, _base_uri: &str, version: u64) -> Result { + async fn get(&self, base_uri: &str, version: u64) -> Result { let request = DescribeTableVersionRequest { id: Some(self.table_id.clone()), version: Some(version as i64), + branch: self.branch_for_base(base_uri)?, ..Default::default() }; @@ -47,11 +81,12 @@ impl ExternalManifestStore for LanceNamespaceExternalManifestStore { Ok(response.version.manifest_path) } - async fn get_latest_version(&self, _base_uri: &str) -> Result> { + async fn get_latest_version(&self, base_uri: &str) -> Result> { let request = ListTableVersionsRequest { id: Some(self.table_id.clone()), descending: Some(true), limit: Some(1), + branch: self.branch_for_base(base_uri)?, ..Default::default() }; @@ -73,7 +108,7 @@ impl ExternalManifestStore for LanceNamespaceExternalManifestStore { /// Put the manifest to the namespace store. 
async fn put( &self, - _base_path: &Path, + base_path: &Path, version: u64, staging_path: &Path, size: u64, @@ -94,6 +129,7 @@ impl ExternalManifestStore for LanceNamespaceExternalManifestStore { manifest_size: Some(size as i64), e_tag: e_tag.clone(), naming_scheme: Some(naming_scheme_str.to_string()), + branch: self.branch_for_base(base_path.as_ref())?, ..Default::default() }; From 6e676ecaad615d1aa5b2a9a90dae97bc24000607 Mon Sep 17 00:00:00 2001 From: Wyatt Alt Date: Wed, 10 Jun 2026 11:11:38 -0700 Subject: [PATCH 075/177] fix(fts): recompute next_id when loading FST token set (#7200) Token ids in a TokenSet are dense [0, len), so next_id must equal the token count. load_fst restored next_id verbatim from the persisted _token_next_id column, so a stale value written by a binary built before #7115 -- whose remap dropped tokens without lowering next_id -- survived the reload. A later segment merge then minted an out-of-range token id and panicked in InnerBuilder::merge_from at self.posting_lists[new_token_id]. Recompute next_id from map.len() (O(1)) instead, mirroring load_arrow, which already does this. Heals existing indexes on read with no migration; writers and file schema are unchanged. Complements 12f529f9 (#7115), which reset next_id after remap in memory but did not heal values already persisted by writers built before it. 
Co-authored-by: Claude Opus 4.8 (1M context) --- .../src/scalar/inverted/builder.rs | 74 +++++++++++++++++++ rust/lance-index/src/scalar/inverted/index.rs | 10 +-- 2 files changed, 78 insertions(+), 6 deletions(-) diff --git a/rust/lance-index/src/scalar/inverted/builder.rs b/rust/lance-index/src/scalar/inverted/builder.rs index 59a992694c9..24b1eb50203 100644 --- a/rust/lance-index/src/scalar/inverted/builder.rs +++ b/rust/lance-index/src/scalar/inverted/builder.rs @@ -3162,6 +3162,80 @@ mod tests { assert!((zeta_id as usize) < first.posting_lists.len()); } + // FST token file with a stale next_id (above the token count), as a pre-#7115 writer left. + async fn write_stale_next_id_token_file(store: &dyn IndexStore, partition_id: u64) { + let mut tokens = TokenSet::default(); + tokens.add("alpha".to_owned()); + tokens.add("gamma".to_owned()); + assert_eq!(tokens.len(), 2); + tokens.next_id = 9; + let batch = tokens.to_batch(TokenSetFormat::Fst).unwrap(); + let mut writer = store + .new_index_file(&token_file_path(partition_id), batch.schema()) + .await + .unwrap(); + writer.write_record_batch(batch).await.unwrap(); + writer.finish().await.unwrap(); + } + + // load_fst recomputes next_id from the token count rather than trusting the persisted value. + #[tokio::test] + async fn test_load_fst_recomputes_stale_next_id() { + let index_dir = TempDir::default(); + let store = Arc::new(LanceIndexStore::new( + ObjectStore::local().into(), + index_dir.obj_path(), + Arc::new(LanceCache::no_cache()), + )); + + write_stale_next_id_token_file(store.as_ref(), 0).await; + let reader = store.open_index_file(&token_file_path(0)).await.unwrap(); + let tokens = TokenSet::load(reader, TokenSetFormat::Fst).await.unwrap(); + assert_eq!(tokens.len(), 2); + assert_eq!(tokens.next_id(), 2); + } + + // A stale next_id loaded from disk must not leak an out-of-range token id into a merge. 
+ #[tokio::test] + async fn test_merge_with_stale_next_id_token_file_does_not_panic() { + let index_dir = TempDir::default(); + let store = Arc::new(LanceIndexStore::new( + ObjectStore::local().into(), + index_dir.obj_path(), + Arc::new(LanceCache::no_cache()), + )); + + write_stale_next_id_token_file(store.as_ref(), 0).await; + let reader = store.open_index_file(&token_file_path(0)).await.unwrap(); + let tokens = TokenSet::load(reader, TokenSetFormat::Fst) + .await + .unwrap() + .into_mutable(); + + let mut first = InnerBuilder::new(0, false, TokenSetFormat::Fst); + first.set_tokens(tokens); + first + .posting_lists + .resize_with(first.tokens.len(), || PostingListBuilder::new(false)); + let doc = first.docs.append(10, 1); + first.posting_lists[0].add(doc, PositionRecorder::Count(1)); + first.posting_lists[1].add(doc, PositionRecorder::Count(1)); + + let mut second = InnerBuilder::new(1, false, TokenSetFormat::Fst); + let zeta = second.tokens.add("zeta".to_owned()); + second + .posting_lists + .resize_with(second.tokens.len(), || PostingListBuilder::new(false)); + let second_doc = second.docs.append(20, 1); + second.posting_lists[zeta as usize].add(second_doc, PositionRecorder::Count(1)); + + first.merge_from(second).unwrap(); + assert_eq!(first.tokens.len(), 3); + assert_eq!(first.posting_lists.len(), 3); + let zeta_id = first.tokens.get("zeta").expect("zeta should be merged in"); + assert!((zeta_id as usize) < first.posting_lists.len()); + } + #[tokio::test] async fn test_update_index_returns_worker_error_when_workers_exit_during_dispatch() { let num_batches = (*LANCE_FTS_NUM_SHARDS * 2 + 1) as u64; diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index 03462270858..43a5e9da373 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -1620,15 +1620,13 @@ impl TokenSet { let map = fst::Map::new(bytes.to_vec()) .map_err(|e| 
Error::index(format!("failed to load fst tokens: {}", e)))?; - let next_id_col = batch[TOKEN_NEXT_ID_COL].as_primitive::(); let total_length_col = batch[TOKEN_TOTAL_LENGTH_COL].as_primitive::(); - let next_id = next_id_col - .values() - .first() - .copied() - .ok_or(Error::index("token next id column is empty".to_owned()))?; + // Token ids are dense `[0, len)`, so `next_id` must equal the token count. Recompute + // it instead of trusting the persisted value, which writers before #7115 could leave + // stale. Mirrors `load_arrow`. + let next_id = map.len() as u32; let total_length = total_length_col .values() From 0dd8c3bf1a6059b45b8386cb1219f65d604b87f5 Mon Sep 17 00:00:00 2001 From: Wyatt Alt Date: Wed, 10 Jun 2026 12:03:48 -0700 Subject: [PATCH 076/177] perf(fts): stream prewarm posting lists in chunks (#7208) Stream each inverted-index partition's posting lists in bounded token-row chunks (read -> build -> insert -> drop) so prewarm's peak resident set is ~one chunk rather than the whole invert.lance, instead of loading whole positional posting segments at once. The chunk target is a fixed 32 MiB. ENT-1663 Co-authored-by: Claude Opus 4.8 (1M context) --- rust/lance-index/src/scalar.rs | 5 + rust/lance-index/src/scalar/inverted/index.rs | 738 ++++++++++++++++-- rust/lance-index/src/scalar/lance_format.rs | 6 + 3 files changed, 664 insertions(+), 85 deletions(-) diff --git a/rust/lance-index/src/scalar.rs b/rust/lance-index/src/scalar.rs index a32ef4f9447..daec92339f8 100644 --- a/rust/lance-index/src/scalar.rs +++ b/rust/lance-index/src/scalar.rs @@ -256,6 +256,11 @@ pub trait IndexReader: Send + Sync { fn num_rows(&self) -> usize; /// Return the metadata of the file fn schema(&self) -> &lance_core::datatypes::Schema; + /// Best-effort on-disk byte size of the file when the reader already knows it + /// without extra I/O, else `None`. Used to size prewarm chunks. 
+ fn file_size_bytes(&self) -> Option { + None + } } /// Trait abstracting I/O away from index logic diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index 43a5e9da373..56547c6510b 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -1023,6 +1023,52 @@ impl Index for InvertedIndex { } } +/// Target on-disk size of one prewarm chunk; a partition is streamed in chunks of +/// ~this size so its peak resident set is one chunk, not the whole `invert.lance`. +const PREWARM_CHUNK_TARGET_BYTES: u64 = 32 << 20; + +/// Cap on token rows per chunk, bounding the built `Vec` when posting lists are tiny. +const PREWARM_MAX_CHUNK_TOKENS: usize = 4096; + +/// Floor on token rows per chunk, so a partition always makes progress. +const PREWARM_MIN_CHUNK_TOKENS: usize = 1; + +/// Token rows per chunk: byte target / average bytes-per-token, clamped to `[MIN, MAX]`. +fn prewarm_chunk_tokens(token_count: usize, file_size_bytes: u64) -> usize { + if token_count == 0 { + return PREWARM_MIN_CHUNK_TOKENS; + } + let bytes_per_token = (file_size_bytes / token_count as u64).max(1); // >= 1: no div-by-zero + let by_bytes = (PREWARM_CHUNK_TARGET_BYTES / bytes_per_token) as usize; + by_bytes.clamp(PREWARM_MIN_CHUNK_TOKENS, PREWARM_MAX_CHUNK_TOKENS) +} + +/// Snap a chunk's exclusive token end back to a posting-group boundary so no group +/// straddles chunks. Returns the largest group boundary in `(tok_start, desired_end]`, +/// or the next boundary past an oversized group so it runs as one solo chunk. 
+fn group_aligned_chunk_end( + starts: &[u32], + token_count: usize, + tok_start: usize, + desired_end: usize, +) -> usize { + let fit = starts + .iter() + .map(|&s| s as usize) + .chain(std::iter::once(token_count)) + .filter(|&b| b > tok_start && b <= desired_end) + .max(); + if let Some(end) = fit { + return end; + } + // Oversized group: extend to its end so it runs as one chunk. + starts + .iter() + .map(|&s| s as usize) + .find(|&b| b > tok_start) + .unwrap_or(token_count) +} + impl InvertedIndex { pub async fn prewarm_with_options(&self, options: &FtsPrewarmOptions) -> Result<()> { let with_position = options.with_position; @@ -2284,50 +2330,84 @@ impl PostingListReader { ) } - fn build_prewarm_posting_lists( - batch: RecordBatch, - offsets: Option>, - max_scores: Option>, - lengths: Option>, - posting_tail_codec: PostingTailCodec, - positions_layout: PositionsLayout, + /// Build posting lists for one chunk's token range from `chunk_batch`, rebasing + /// global offsets to chunk-local rows. Returns `(global token_id, PostingList)` + /// pairs identical to the whole-file path, only bounded to one chunk. 
+ fn build_prewarm_posting_lists_chunk( + chunk_batch: RecordBatch, + chunk: PrewarmChunk<'_>, + ctx: &PrewarmBuildCtx<'_>, ) -> Result> { - let token_count = if let Some(offsets) = offsets.as_ref() { - offsets.len() - } else if let Some(lengths) = lengths.as_ref() { - lengths.len() - } else { - batch.num_rows() - }; - - let mut posting_lists = Vec::with_capacity(token_count); - for token_id in 0..token_count { - let batch = if let Some(offsets) = offsets.as_ref() { - let start = offsets[token_id]; - let end = if token_id + 1 < offsets.len() { - offsets[token_id + 1] + let mut posting_lists = Vec::with_capacity(chunk.token_count); + for local in 0..chunk.token_count { + let global = chunk.tok_start + local; + let row_batch = if let Some(chunk_offsets) = chunk.offsets { + // Legacy v1: rebase global offsets to chunk row 0; the last token + // ends at `chunk.end_row` (no trailing sentinel in chunk_offsets). + let base = chunk_offsets[0]; + let start = chunk_offsets[local] - base; + let end = if local + 1 < chunk_offsets.len() { + chunk_offsets[local + 1] - base } else { - batch.num_rows() + chunk.end_row - base }; - batch.slice(start, end - start) + chunk_batch.slice(start, end - start) } else { - batch.slice(token_id, 1) + // V2: one posting row per token; row `local` within the chunk. 
+ chunk_batch.slice(local, 1) }; - let batch = batch.shrink_to_fit()?; + let row_batch = row_batch.shrink_to_fit()?; let posting_list = Self::posting_list_from_batch_parts( - &batch, - max_scores.as_ref().map(|scores| scores[token_id]), - lengths.as_ref().map(|lengths| lengths[token_id]), - posting_tail_codec, - positions_layout, + &row_batch, + ctx.max_scores.map(|scores| scores[global]), + ctx.lengths.map(|lengths| lengths[global]), + ctx.posting_tail_codec, + ctx.positions_layout, )?; - posting_lists.push((token_id as u32, posting_list)); + posting_lists.push((global as u32, posting_list)); } Ok(posting_lists) } + /// Read the posting rows for token ids `[tok_start, tok_end)` into one RecordBatch. + /// For v2 the token range is the row range; for v1 it's derived from the offsets. + async fn read_chunk_batch( + &self, + tok_start: usize, + tok_end: usize, + with_position: bool, + ) -> Result { + let columns = self.posting_columns(with_position); + let row_range = match &self.metadata { + PostingMetadata::LegacyV1 { offsets, .. } => { + let start = offsets[tok_start]; + let end = offsets + .get(tok_end) + .copied() + .unwrap_or_else(|| self.reader.num_rows()); + start..end + } + PostingMetadata::V2 { .. } => tok_start..tok_end, + }; + let batch = self.reader.read_range(row_range, Some(&columns)).await?; + Ok(batch) + } + async fn prewarm_posting_lists(&self, with_position: bool) -> Result<()> { + self.prewarm_posting_lists_chunked(with_position, None) + .await?; + Ok(()) + } + + /// Stream the partition's posting lists into the cache in bounded token-row chunks + /// (read -> build -> insert -> drop), so peak resident set is ~one chunk. Returns + /// the chunk count (tests assert it split). `chunk_tokens_override` is test-only. 
+ async fn prewarm_posting_lists_chunked( + &self, + with_position: bool, + chunk_tokens_override: Option, + ) -> Result { if with_position && !self.has_positions() { return Err(Error::invalid_input( "cannot prewarm positions for an inverted index that was built without positions; recreate the index with with_position=true".to_owned(), @@ -2339,34 +2419,124 @@ impl PostingListReader { // OnceCells. self.ensure_metadata_loaded().await?; - let read_batch_start = Instant::now(); - let batch = self.read_batch(with_position).await?; - let read_batch_elapsed = read_batch_start.elapsed(); + let state = self.chunk_build_state(); + // With grouping the cache stores one entry per group, so a group's posting + // lists must all be resident at once: align chunk boundaries to whole + // groups. Without grouping, chunks are plain token ranges. + let group_starts = self.group_starts.clone(); + let token_count = self.len(); + let chunk_tokens = chunk_tokens_override + .unwrap_or_else(|| prewarm_chunk_tokens(token_count, self.posting_data_size_bytes())) + .max(1); + + let mut chunk_count = 0usize; + let read_build_start = Instant::now(); + let mut tok_start = 0usize; + while tok_start < token_count { + let mut tok_end = (tok_start + chunk_tokens).min(token_count); + // `tok_start` is always a group boundary; snap `tok_end` back to one too. 
+ if let Some(starts) = group_starts.as_ref() { + tok_end = group_aligned_chunk_end(starts, token_count, tok_start, tok_end); + } + chunk_count += 1; - let (legacy_layout, offsets, max_scores, lengths) = match &self.metadata { + let posting_lists = self + .build_chunk_postings(tok_start, tok_end, with_position, &state) + .await?; + self.publish_chunk_postings( + posting_lists, + group_starts.as_deref(), + tok_start, + tok_end, + token_count, + with_position, + ) + .await; + + tok_start = tok_end; + } + let read_build_elapsed = read_build_start.elapsed(); + + info!( + legacy_layout = self.is_legacy_layout(), + with_position, + token_count, + chunk_count, + chunk_tokens, + read_build_ms = read_build_elapsed.as_secs_f64() * 1000.0, + "posting list prewarm timing" + ); + + Ok(chunk_count) + } + + /// Loop-invariant inputs shared by every chunk build: the metadata vecs + /// (`Arc`d so chunks share them without re-cloning) plus codec/layout. + fn chunk_build_state(&self) -> ChunkBuildState { + let (offsets, max_scores, lengths) = match &self.metadata { PostingMetadata::LegacyV1 { offsets, max_scores, - } => (true, Some(offsets.clone()), max_scores.clone(), None), + } => (Some(offsets.clone()), max_scores.clone(), None), PostingMetadata::V2 { metadata } => ( - false, None, metadata.get().map(|loaded| loaded.max_scores.clone()), metadata.get().map(|loaded| loaded.lengths.clone()), ), }; - let posting_tail_codec = self.posting_tail_codec; - let positions_layout = self.positions_layout; - let populate_start = Instant::now(); + ChunkBuildState { + offsets: offsets.map(Arc::new), + max_scores: max_scores.map(Arc::new), + lengths: lengths.map(Arc::new), + posting_tail_codec: self.posting_tail_codec, + positions_layout: self.positions_layout, + } + } + + /// Read one token-row chunk and build its posting lists off the runtime thread. + /// The large batch is dropped inside the blocking task once built, bounding + /// resident memory to one chunk. 
+ async fn build_chunk_postings( + &self, + tok_start: usize, + tok_end: usize, + with_position: bool, + state: &ChunkBuildState, + ) -> Result> { + let chunk_token_count = tok_end - tok_start; + let chunk_batch = self + .read_chunk_batch(tok_start, tok_end, with_position) + .await?; + + let (chunk_offsets, chunk_end_row) = match state.offsets.as_ref() { + Some(offsets) => { + let end_row = offsets + .get(tok_end) + .copied() + .unwrap_or_else(|| self.reader.num_rows()); + (Some(offsets[tok_start..tok_end].to_vec()), end_row) + } + // V2 doesn't use chunk_end_row (one row per token); pass tok_end. + None => (None, tok_end), + }; + let max_scores = state.max_scores.clone(); + let lengths = state.lengths.clone(); + let posting_tail_codec = state.posting_tail_codec; + let positions_layout = state.positions_layout; let posting_lists = spawn_blocking(move || { - Self::build_prewarm_posting_lists( - batch, - offsets, - max_scores, - lengths, + let ctx = PrewarmBuildCtx { + max_scores: max_scores.as_deref().map(|v| v.as_slice()), + lengths: lengths.as_deref().map(|v| v.as_slice()), posting_tail_codec, positions_layout, - ) + }; + let chunk = PrewarmChunk { + tok_start, + token_count: chunk_token_count, + offsets: chunk_offsets.as_deref(), + end_row: chunk_end_row, + }; + Self::build_prewarm_posting_lists_chunk(chunk_batch, chunk, &ctx) }) .await .map_err(|err| { @@ -2374,60 +2544,95 @@ impl PostingListReader { "Failed to build prewarm posting lists in blocking task: {err}" )) })??; - // Strip positions into their own per-token cache entries first - // (unchanged); the posting cache holds positions-free lists. 
- let mut postings_by_token = Vec::with_capacity(posting_lists.len()); - for (token_id, mut posting_list) in posting_lists { - if with_position && let Some(positions) = posting_list.take_positions() { - self.index_cache - .insert_with_key(&PositionKey { token_id }, Arc::new(Positions(positions))) - .await; - } - debug_assert_eq!(token_id as usize, postings_by_token.len()); - postings_by_token.push(posting_list); - } - // Populate the same cache keys the read path uses: grouped entries when - // grouping is active (issue #7040), per-token entries otherwise. - match self.group_starts.as_ref() { + // The chunk yields its token range as contiguous ascending ids from + // `tok_start`; the group publish path relies on this to index the lists. + debug_assert_eq!(posting_lists.len(), chunk_token_count); + debug_assert!( + posting_lists + .iter() + .enumerate() + .all(|(i, (token_id, _))| *token_id as usize == tok_start + i) + ); + Ok(posting_lists) + } + + /// Strip positions into their own per-token cache entries (the posting cache + /// holds positions-free lists), then populate the same cache keys the read + /// path uses: grouped entries when grouping is active, per-token entries + /// otherwise. Called once per chunk; the chunk's lists drop on return. + async fn publish_chunk_postings( + &self, + posting_lists: Vec<(u32, PostingList)>, + group_starts: Option<&[u32]>, + tok_start: usize, + tok_end: usize, + token_count: usize, + with_position: bool, + ) { + match group_starts { Some(starts) => { - // The read path derives the last group's `end` from `self.len()`; - // match it here so both produce identical `PostingListGroupKey`s. 
- debug_assert_eq!(postings_by_token.len(), self.len()); + let mut chunk_postings = Vec::with_capacity(posting_lists.len()); + for (token_id, mut posting_list) in posting_lists { + self.cache_positions(&mut posting_list, token_id, with_position) + .await; + chunk_postings.push(posting_list); + } + // Chunk is group-aligned, so every group starting in it also ends + // in it; `chunk_postings[i]` is token `tok_start + i`. The last + // group's `end` derives from `token_count`, matching the read path + // so both produce identical `PostingListGroupKey`s. for (k, &start) in starts.iter().enumerate() { - let end = starts.get(k + 1).copied().unwrap_or(self.len() as u32); - let group = PostingListGroup::new( - postings_by_token[start as usize..end as usize].to_vec(), - ); + let start_usize = start as usize; + if start_usize < tok_start || start_usize >= tok_end { + continue; + } + let end = starts.get(k + 1).copied().unwrap_or(token_count as u32); + let lo = start_usize - tok_start; + let hi = end as usize - tok_start; + let group = PostingListGroup::new(chunk_postings[lo..hi].to_vec()); self.index_cache .insert_with_key(&PostingListGroupKey { start, end }, Arc::new(group)) .await; } } None => { - for (token_id, posting_list) in postings_by_token.into_iter().enumerate() { + for (token_id, mut posting_list) in posting_lists { + self.cache_positions(&mut posting_list, token_id, with_position) + .await; self.index_cache - .insert_with_key( - &PostingListKey { - token_id: token_id as u32, - }, - Arc::new(posting_list), - ) + .insert_with_key(&PostingListKey { token_id }, Arc::new(posting_list)) .await; } } } - let populate_elapsed = populate_start.elapsed(); + } - info!( - legacy_layout, - with_position, - token_count = self.len(), - read_batch_ms = read_batch_elapsed.as_secs_f64() * 1000.0, - post_read_loop_ms = populate_elapsed.as_secs_f64() * 1000.0, - "posting list prewarm timing" - ); + /// Move a posting list's positions (when present and requested) into the + /// 
dedicated per-token position cache, leaving the posting list positions-free. + async fn cache_positions( + &self, + posting_list: &mut PostingList, + token_id: u32, + with_position: bool, + ) { + if with_position && let Some(positions) = posting_list.take_positions() { + self.index_cache + .insert_with_key(&PositionKey { token_id }, Arc::new(Positions(positions))) + .await; + } + } - Ok(()) + /// Cheap `invert.lance` size estimate (file length from object metadata, no + /// data read), used only to size prewarm chunks. Falls back to a row-count + /// proxy when the reader can't surface the length (legacy v1). + pub(crate) fn posting_data_size_bytes(&self) -> u64 { + if let Some(size) = self.reader.file_size_bytes() { + return size; + } + // Fallback proxy for readers that don't cache their file length: just needs + // to be monotonic in partition size. + const ESTIMATED_BYTES_PER_ROW: u64 = 16; + (self.reader.num_rows() as u64).saturating_mul(ESTIMATED_BYTES_PER_ROW) } pub(crate) async fn read_batch(&self, with_position: bool) -> Result { @@ -2569,6 +2774,38 @@ impl PostingListReader { } } +/// Loop-invariant state for [`InvertedPartition::build_chunk_postings`]. The +/// metadata vecs are `Arc`d so each chunk's blocking build shares them cheaply. +struct ChunkBuildState { + offsets: Option>>, + max_scores: Option>>, + lengths: Option>>, + posting_tail_codec: PostingTailCodec, + positions_layout: PositionsLayout, +} + +/// Chunk-invariant inputs to [`InvertedPartition::build_prewarm_posting_lists_chunk`]: +/// the per-partition codec/layout and the (shared, whole-partition) metadata +/// slices indexed by global token id. These don't change across chunks. 
+struct PrewarmBuildCtx<'a> { + max_scores: Option<&'a [f32]>, + lengths: Option<&'a [u32]>, + posting_tail_codec: PostingTailCodec, + positions_layout: PositionsLayout, +} + +/// Per-chunk inputs to [`InvertedPartition::build_prewarm_posting_lists_chunk`]: +/// the token sub-range `[tok_start, tok_start + token_count)` and, for legacy +/// v1, the rebased offset slice plus the chunk's end row. +struct PrewarmChunk<'a> { + tok_start: usize, + token_count: usize, + /// Legacy v1 only: `offsets[tok_start..tok_start+token_count]` (no sentinel). + offsets: Option<&'a [usize]>, + /// Legacy v1 only: global row at which this chunk's posting rows end. + end_row: usize, +} + /// New type just to allow Positions implement DeepSizeOf so it can be put /// in the cache. #[derive(Clone)] @@ -5839,6 +6076,253 @@ mod tests { ); } + /// Prewarming a large partition in multiple chunks must end up holding exactly the + /// same per-token posting lists (doc ids and frequencies) as the whole-file path. + /// Parametrized over layout: the legacy-v1 chunk path rebases global offsets to + /// chunk-local rows, which the v2 one-row-per-token path never exercises. + #[rstest::rstest] + #[case::v1(InvertedListFormatVersion::V1)] + #[case::v2(InvertedListFormatVersion::V2)] + #[tokio::test] + async fn test_prewarm_streams_in_chunks_preserves_content( + #[case] format_version: InvertedListFormatVersion, + ) { + let tmpdir = TempObjDir::default(); + let store = Arc::new(LanceIndexStore::new( + ObjectStore::local().into(), + tmpdir.clone(), + Arc::new(LanceCache::no_cache()), + )); + + // One partition with many tokens (so it spans many chunks) and several + // docs per token (so each token is more than one posting row). 
+ const NUM_TOKENS: u32 = 20; + const DOCS_PER_TOKEN: u32 = 3; + let posting_tail_codec = format_version.posting_tail_codec(); + let mut builder = InnerBuilder::new_with_format_version( + 0, + false, + TokenSetFormat::default(), + format_version, + ); + // Small groups so the partition spans several; chunks snap to whole groups, + // so several groups are needed to stream in more than one chunk. + builder.group_config = PostingGroupConfig { + target_bytes: 4096, + max_tokens: 4, + }; + // expected[token] = [(doc_id, frequency)] in stored (doc-id) order. + let mut expected: Vec> = Vec::new(); + let mut doc_id = 0u64; + for t in 0..NUM_TOKENS { + builder.tokens.add(format!("tok_{t:03}")); + let mut posting = + PostingListBuilder::new_with_posting_tail_codec(false, posting_tail_codec); + let mut docs = Vec::new(); + for _ in 0..DOCS_PER_TOKEN { + posting.add(doc_id as u32, PositionRecorder::Count(1)); + builder.docs.append(doc_id, 1); + docs.push((doc_id as u32, 1)); + doc_id += 1; + } + expected.push(docs); + builder.posting_lists.push(posting); + } + builder.write(store.as_ref()).await.unwrap(); + + let metadata = std::collections::HashMap::from_iter(vec![ + ( + "partitions".to_owned(), + serde_json::to_string(&vec![0u64]).unwrap(), + ), + ( + "params".to_owned(), + serde_json::to_string(&InvertedIndexParams::default()).unwrap(), + ), + ( + TOKEN_SET_FORMAT_KEY.to_owned(), + TokenSetFormat::default().to_string(), + ), + ( + POSTING_TAIL_CODEC_KEY.to_owned(), + posting_tail_codec.as_str().to_owned(), + ), + ]); + let mut writer = store + .new_index_file(METADATA_FILE, Arc::new(arrow_schema::Schema::empty())) + .await + .unwrap(); + writer.finish_with_metadata(metadata).await.unwrap(); + + let cache = Arc::new(LanceCache::with_capacity(1 << 20)); + let index = InvertedIndex::load(store.clone(), None, cache.as_ref()) + .await + .unwrap(); + let inverted_list = &index.partitions[0].inverted_list; + assert_eq!(inverted_list.len(), NUM_TOKENS as usize); + + // Force a 
small chunk so the partition deterministically splits; with + // CHUNK_TOKENS < NUM_TOKENS each chunk is bounded below the whole partition. + const CHUNK_TOKENS: usize = 6; + let chunk_count = inverted_list + .prewarm_posting_lists_chunked(false, Some(CHUNK_TOKENS)) + .await + .unwrap(); + + // (1) The partition was streamed in multiple chunks. The exact count is + // group-alignment-dependent (chunks snap to whole groups), so just + // require more than one. + assert!( + chunk_count > 1, + "single partition must be streamed in more than one chunk, got {chunk_count}" + ); + + // (2) Correctness: every token's posting list round-trips with exactly + // the doc ids and frequencies of the whole-file path. + for token_id in 0..NUM_TOKENS { + let actual = inverted_list + .posting_list(token_id, false, &NoOpMetricsCollector) + .await + .unwrap() + .iter() + .map(|(doc_id, freq, _positions)| (doc_id as u32, freq)) + .collect::>(); + assert_eq!( + actual, expected[token_id as usize], + "token {token_id} posting list mismatch after chunked prewarm" + ); + } + } + + /// With positions, the chunked prewarm must strip positions into their own + /// per-token cache entries (leaving the posting cache positions-free) and still + /// round-trip exact doc ids, frequencies, and positions across chunk boundaries. 
+ #[tokio::test] + async fn test_prewarm_streams_in_chunks_with_positions() { + let tmpdir = TempObjDir::default(); + let store = Arc::new(LanceIndexStore::new( + ObjectStore::local().into(), + tmpdir.clone(), + Arc::new(LanceCache::no_cache()), + )); + + let format_version = InvertedListFormatVersion::V2; + let posting_tail_codec = format_version.posting_tail_codec(); + const NUM_TOKENS: u32 = 16; + const DOCS_PER_TOKEN: u32 = 3; + let mut builder = InnerBuilder::new_with_format_version( + 0, + true, + TokenSetFormat::default(), + format_version, + ); + builder.group_config = PostingGroupConfig { + target_bytes: 4096, + max_tokens: 4, + }; + // expected[token] = [(doc_id, frequency, positions)]. + let mut expected: Vec)>> = Vec::new(); + let mut doc_id = 0u64; + for t in 0..NUM_TOKENS { + builder.tokens.add(format!("tok_{t:03}")); + let mut posting = + PostingListBuilder::new_with_posting_tail_codec(true, posting_tail_codec); + let mut docs = Vec::new(); + for _ in 0..DOCS_PER_TOKEN { + let positions = vec![t % 3, t % 3 + 2, t % 3 + 5]; + posting.add( + doc_id as u32, + PositionRecorder::Position(positions.clone().into()), + ); + builder.docs.append(doc_id, positions.len() as u32); + docs.push((doc_id as u32, positions.len() as u32, positions)); + doc_id += 1; + } + expected.push(docs); + builder.posting_lists.push(posting); + } + builder.write(store.as_ref()).await.unwrap(); + + let metadata = std::collections::HashMap::from_iter(vec![ + ( + "partitions".to_owned(), + serde_json::to_string(&vec![0u64]).unwrap(), + ), + ( + "params".to_owned(), + serde_json::to_string(&InvertedIndexParams::default().with_position(true)).unwrap(), + ), + ( + TOKEN_SET_FORMAT_KEY.to_owned(), + TokenSetFormat::default().to_string(), + ), + ( + POSTING_TAIL_CODEC_KEY.to_owned(), + posting_tail_codec.as_str().to_owned(), + ), + ( + POSITIONS_LAYOUT_KEY.to_owned(), + POSITIONS_LAYOUT_SHARED_STREAM_V2.to_owned(), + ), + ( + POSITIONS_CODEC_KEY.to_owned(), + 
PositionStreamCodec::PackedDelta.as_str().to_owned(), + ), + ]); + let mut writer = store + .new_index_file(METADATA_FILE, Arc::new(arrow_schema::Schema::empty())) + .await + .unwrap(); + writer.finish_with_metadata(metadata).await.unwrap(); + + let cache = Arc::new(LanceCache::with_capacity(1 << 20)); + let index = InvertedIndex::load(store.clone(), None, cache.as_ref()) + .await + .unwrap(); + let inverted_list = &index.partitions[0].inverted_list; + + const CHUNK_TOKENS: usize = 5; + let chunk_count = inverted_list + .prewarm_posting_lists_chunked(true, Some(CHUNK_TOKENS)) + .await + .unwrap(); + assert!( + chunk_count > 1, + "partition must be streamed in more than one chunk, got {chunk_count}" + ); + + for token_id in 0..NUM_TOKENS { + // The prewarmed posting cache entry is positions-free. + let (start, end) = inverted_list.group_range_for_token(token_id).unwrap(); + let group = inverted_list + .index_cache + .get_with_key(&PostingListGroupKey { start, end }) + .await + .unwrap(); + let slot = (token_id - start) as usize; + assert!( + !group.get(slot).unwrap().has_position(), + "token {token_id} posting cache entry must be positions-free after prewarm" + ); + + // Full content (doc ids, frequencies, positions) round-trips; the + // positions come from the dedicated per-token cache prewarm populated. + let actual = inverted_list + .posting_list(token_id, true, &NoOpMetricsCollector) + .await + .unwrap() + .iter() + .map(|(doc_id, freq, positions)| { + (doc_id as u32, freq, positions.unwrap().collect::>()) + }) + .collect::>(); + assert_eq!( + actual, expected[token_id as usize], + "token {token_id} posting list / positions mismatch after chunked prewarm" + ); + } + } + /// IO accounting for the IO-counting stats test below: tracks bytes /// pulled from the posting file so we can assert that the stats path is /// O(1) in num_unique_tokens. 
@@ -6759,6 +7243,90 @@ mod tests { assert_eq!(row_ids, vec![100]); } + /// Build a multi-partition inverted index in `store` with `num_partitions` + /// partitions, each carrying a handful of tokens/docs. + async fn build_multi_partition_index( + store: &Arc<LanceIndexStore>, + num_partitions: u64, + ) -> (Arc<InvertedIndex>, Arc<LanceCache>) { + for id in 0..num_partitions { + let mut builder = InnerBuilder::new_with_format_version( + id, + false, + TokenSetFormat::default(), + InvertedListFormatVersion::V1, + ); + // A few distinct tokens per partition so each posting file has real + // content to read and materialize during prewarm. + for t in 0..4u32 { + builder.tokens.add(format!("tok_{id}_{t}")); + let mut posting = PostingListBuilder::new_with_posting_tail_codec( + false, + PostingTailCodec::Fixed32, + ); + let base = id * 1000 + t as u64 * 10; + for d in 0..5u32 { + posting.add(d, PositionRecorder::Count(1)); + builder.docs.append(base + d as u64, 4); + } + builder.posting_lists.push(posting); + } + builder.write(store.as_ref()).await.unwrap(); + } + + let partition_ids: Vec<u64> = (0..num_partitions).collect(); + let metadata = std::collections::HashMap::from_iter(vec![ + ( + "partitions".to_owned(), + serde_json::to_string(&partition_ids).unwrap(), + ), + ( + "params".to_owned(), + serde_json::to_string(&InvertedIndexParams::default()).unwrap(), + ), + ( + TOKEN_SET_FORMAT_KEY.to_owned(), + TokenSetFormat::default().to_string(), + ), + ]); + let mut writer = store + .new_index_file(METADATA_FILE, Arc::new(arrow_schema::Schema::empty())) + .await + .unwrap(); + writer.finish_with_metadata(metadata).await.unwrap(); + + // Keep the cache alive and return it: the partition readers hold only a + // WeakLanceCache, so the prewarmed entries vanish if this Arc is dropped.
+ let cache = Arc::new(LanceCache::with_capacity(1 << 20)); + let index = InvertedIndex::load(store.clone(), None, cache.as_ref()) + .await + .unwrap(); + (index, cache) + } + + /// The prewarm cost estimate must come from cheap object metadata (the + /// posting file length) without reading the posting data, and must be + /// monotonic in the partition's content. + #[tokio::test] + async fn test_posting_data_size_bytes_uses_file_length() { + let tmpdir = TempObjDir::default(); + let store = Arc::new(LanceIndexStore::new( + ObjectStore::local().into(), + tmpdir.clone(), + Arc::new(LanceCache::no_cache()), + )); + let (index, _cache) = build_multi_partition_index(&store, 3).await; + for part in &index.partitions { + // File length is reported by object metadata at open time; it must be + // non-trivial for a partition that actually holds postings. + let est = part.inverted_list.posting_data_size_bytes(); + assert!( + est > 0, + "expected a non-zero posting-data size estimate, got {est}" + ); + } + } + #[tokio::test] async fn test_update_preserves_loaded_v2_format_version() -> Result<()> { let src_dir = TempObjDir::default(); diff --git a/rust/lance-index/src/scalar/lance_format.rs b/rust/lance-index/src/scalar/lance_format.rs index f3ba9eb93b1..562945b8f0d 100644 --- a/rust/lance-index/src/scalar/lance_format.rs +++ b/rust/lance-index/src/scalar/lance_format.rs @@ -370,6 +370,12 @@ impl IndexReader for current_reader::FileReader { fn schema(&self) -> &lance_core::datatypes::Schema { Self::schema(self) } + + fn file_size_bytes(&self) -> Option { + // The manifest records each index file's size and passes it to the reader + // at open, so it's already in metadata here (no extra I/O). 
+ Some(self.metadata().file_size()) + } } #[async_trait] From d8f443c7c25f1612d95e4a55d9a22fed8884cdc6 Mon Sep 17 00:00:00 2001 From: Wyatt Alt Date: Wed, 10 Jun 2026 13:11:24 -0700 Subject: [PATCH 077/177] test(compat): search maintenance-op sequences between two arbitrary refs (#7206) Adds a cross-version index compat capability: take two arbitrary refs (version, sha, branch, or tag), age an index under the first, exercise it under the second, and search maintenance-op sequences for panics or correctness divergence -- without hand-coding the failing sequence. This is the test that would have caught ENT-1662 (FTS `load_fst` stale `next_id` panic, fixed in #7200). ## What's here - `venv_manager.py`: a venv per ref -- published releases install the wheel, anything else builds from a git worktree via maturin. Cached and lock-guarded so parallel workers build each ref once. Sub-venv Rust panics are surfaced as the failure. - `compat_sequence.py`: breadth-first search over op sequences (write/delete/compact/optimize) up to a max length, for INVERTED + BTREE, BITMAP, LABEL_LIST, NGRAM, ZONEMAP, BLOOMFILTER. Non-vacuous oracle per kind (FTS count for INVERTED; index-vs-`use_scalar_index=False` for scalar). Setups built once and snapshotted for speed. - `test_index_sequence.py`: env-driven entry, parallel via xdist sharding. Failures print the plain-English sequence and real error. Default refs are the two most recent stable releases. - `.github/workflows/python.yml` (`compat-sequence` job): on non-pull-request runs (post-merge), ages indexes under the latest release of each of the two previous majors and exercises them under the freshly built commit, reusing the prebuilt wheel. - `.github/workflows/compat-pair.yml`: `workflow_dispatch` runs one explicit ref pair on demand. ## Test `pytest python/tests/compat/test_index_sequence.py --run-compat -n 128` reproduces ENT-1662 and runs all scalar kinds green. JSON is the one index kind not yet covered by the sequence search.
--------- Co-authored-by: Claude Opus 4.8 (1M context) --- .github/workflows/compat-pair.yml | 78 +++++ .github/workflows/python.yml | 59 ++++ python/python/tests/compat/compat_sequence.py | 296 ++++++++++++++++++ .../tests/compat/test_index_sequence.py | 81 +++++ python/python/tests/compat/venv_manager.py | 232 +++++++++++--- 5 files changed, 704 insertions(+), 42 deletions(-) create mode 100644 .github/workflows/compat-pair.yml create mode 100644 python/python/tests/compat/compat_sequence.py create mode 100644 python/python/tests/compat/test_index_sequence.py diff --git a/.github/workflows/compat-pair.yml b/.github/workflows/compat-pair.yml new file mode 100644 index 00000000000..b7f1f44535b --- /dev/null +++ b/.github/workflows/compat-pair.yml @@ -0,0 +1,78 @@ +# On-demand cross-version index compatibility run between two arbitrary refs. +# +# The PR path lives in python.yml (compat-sequence job), which ages the two previous +# majors into the PR head reusing the prebuilt wheel. This workflow is the manual escape +# hatch for any other pairing: each ref (version, sha, branch, or tag) is provisioned by +# the framework -- a published release installs a wheel, anything else is built from a +# worktree via maturin -- so two arbitrary refs can be compared even when neither has a +# published wheel. The suite ages an index under the writer ref and exercises it under the +# reader, searching maintenance-op sequences for panics or correctness divergence. +name: Compat (index ref pair) + +on: + workflow_dispatch: + inputs: + from_ref: + description: "Writer ref (version / sha / branch). Blank = 2nd most recent release." + required: false + default: "" + to_ref: + description: "Reader ref (version / sha / branch). Blank = most recent release." + required: false + default: "" + kinds: + description: "Comma-separated index kinds (INVERTED,BTREE,...) or 'all'." 
+ required: false + default: "all" + max_length: + description: "Max maintenance-op sequence length to search (deeper = slower)." + required: false + default: "5" + +jobs: + compat-pair: + runs-on: ubuntu-latest + timeout-minutes: 120 + defaults: + run: + working-directory: python + env: + COMPAT_FROM_REF: ${{ inputs.from_ref }} + COMPAT_TO_REF: ${{ inputs.to_ref }} + COMPAT_MAX_LENGTH: ${{ inputs.max_length || '5' }} + steps: + # Full history so arbitrary refs can be checked out for builds. + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + with: + fetch-depth: 0 + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 + with: + python-version: "3.11" + # Toolchain for the build-from-source provisioning path (refs without a wheel). + - uses: actions-rust-lang/setup-rust-toolchain@a0b538fa0b742a6aa35d6e2c169b4bd06d225a98 # v1 + - uses: Swatinem/rust-cache@779680da715d629ac1d338a641029a2f4372abb5 # v2 + - name: Install build deps + run: | + sudo apt update + sudo apt install -y protobuf-compiler libssl-dev + - name: Install host deps + run: pip install pytest pytest-xdist pyarrow packaging maturin + - name: Resolve kinds + id: kinds + env: + KINDS_IN: ${{ inputs.kinds }} + run: | + if [ -z "$KINDS_IN" ] || [ "$KINDS_IN" = "all" ]; then + echo "value=INVERTED,BTREE,BITMAP,LABEL_LIST,NGRAM,ZONEMAP,BLOOMFILTER" >> "$GITHUB_OUTPUT" + else + echo "value=$KINDS_IN" >> "$GITHUB_OUTPUT" + fi + # Oversubscribe (4x cores): each scenario writes a small dataset to disk and the + # reader spends most of its time in short subprocess round-trips, so a worker waiting + # on disk or the sub-venv pipe overlaps another's compute. 
+ - name: Run compat suite (${{ inputs.from_ref }} -> ${{ inputs.to_ref }}) + env: + COMPAT_KINDS: ${{ steps.kinds.outputs.value }} + run: | + python -m pytest python/tests/compat/test_index_sequence.py \ + --run-compat -n "$(( $(nproc) * 4 ))" -v --no-header diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 973d9a632a7..f9bb3132b38 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -162,6 +162,65 @@ jobs: env: COMPAT_TEMP_VENV: 1 + # Cross-version index maintenance-sequence search (see tests/compat/compat_sequence.py). + # Ages an index under the latest release of each of the two previous majors and exercises + # it under this commit, searching op sequences for panics or correctness divergence. The + # reader (HEAD) reuses the wheel the `linux` job already built rather than recompiling. + # Post-merge only: the search is slower than the rest of the suite, so it runs on pushes + # to main/release rather than blocking every PR (same gating as rust-benchmark). + compat-sequence: + needs: linux + if: github.event_name != 'pull_request' + timeout-minutes: 60 + runs-on: ubuntu-24.04 + name: Index Sequence Compat + defaults: + run: + shell: bash + working-directory: python + steps: + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + - name: Set up Python + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 + with: + python-version: 3.13 + - name: Download wheels + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 + with: + name: linux-wheels + path: python/wheels + - name: Install host deps + run: pip install pytest pytest-xdist pyarrow packaging + # Age under each of the two previous majors (writer wheels from PyPI) and read under + # this commit's prebuilt wheel. Length 5 is the shallowest depth that reaches the + # ENT-1662 sequence; -n oversubscribes cores since the work is I/O-bound. 
+ - name: Run sequence search (previous two majors -> HEAD) + env: + COMPAT_MAX_LENGTH: "5" + COMPAT_TO_REF: HEAD + COMPAT_PREBUILT_REF: HEAD + run: | + set -euo pipefail + wheel=$(ls "$PWD"/wheels/pylance-*.whl | head -1) + refs=$(PYTHONPATH=python/tests python -c " + from compat.compat_decorator import pylance_stable_versions + latest = {} + for v in pylance_stable_versions(): + latest[v.major] = v + print(' '.join(str(latest[m]) for m in sorted(latest)[-2:])) + ") + echo "reader wheel: $wheel" + echo "writer refs : $refs" + status=0 + for from_ref in $refs; do + echo "::group::$from_ref -> HEAD" + COMPAT_FROM_REF="$from_ref" COMPAT_PREBUILT_WHEEL="$wheel" \ + python -m pytest python/tests/compat/test_index_sequence.py \ + --run-compat -n "$(( $(nproc) * 4 ))" -v --no-header || status=1 + echo "::endgroup::" + done + exit $status + linux-arm: timeout-minutes: 45 runs-on: ubuntu-24.04-arm64-4x diff --git a/python/python/tests/compat/compat_sequence.py b/python/python/tests/compat/compat_sequence.py new file mode 100644 index 00000000000..bec216d6a6f --- /dev/null +++ b/python/python/tests/compat/compat_sequence.py @@ -0,0 +1,296 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +"""Cross-version index maintenance-sequence search. + +Runs on the same per-ref venv substrate as the rest of this package: venv_factory +(venv_manager.py) provisions one venv per ref, so the *setup* half of a sequence runs +under `from_ref` and the *exercise* half under `to_ref` (the version split). After each +run an oracle checks that the reader did not panic and that an index query agrees with a +full (unindexed) scan. This *discovers* cross-version regressions (e.g. ENT-1662) +without hand-coding the triggering sequence. + +The scenario is parameterized by index *kind* so every scalar index type gets the same +aged-lifecycle, cross-version treatment. 
The oracle runs the same predicate twice -- +normally and with use_scalar_index=False (lance ignores the index) -- and requires +the results to match. If the two query plans are identical the index wasn't used, so the +comparison is skipped rather than failed (uninformative, not a regression). FTS has no +"ignore the index" mode to diff against, so its oracle reconstructs ground truth from a +full scan: tokenize every live row, then require an FTS search for a spread of sampled +terms to return exactly the rows that contain them. The FTS scenarios run under both +on-disk format versions (LANCE_FTS_FORMAT_VERSION 1 and 2), which take different merge +paths. + +The op vocabulary and bounds are deliberately small so the search is runnable; this is +exhaustive over the maintenance-lifecycle grammar up to the configured lengths, not over +every op permutation. +""" + +import itertools +import os +import shutil +from pathlib import Path + +ROWS_PER_WRITE = 200 + +SETUP_TAIL_OPS = ["D", "C", "W"] +EXERCISE_OPS = ["W", "D", "C", "Oa", "Om", "Od"] + +OP_NAMES = { + "W": "write rows", + "I": "create index", + "D": "delete rows", + "C": "compact", + "Oa": "optimize (append)", + "Om": "optimize (merge)", + "Od": "optimize", +} + + +def describe(kind, from_ref, to_ref, setup_ops, exercise_ops, fts_version=None): + """A plain-English description of a scenario for failure output.""" + writer = ", then ".join(OP_NAMES[o] for o in ["W", "I", *setup_ops]) + reader = ", then ".join(OP_NAMES[o] for o in exercise_ops) + tag = f" (fts fmt v{fts_version})" if fts_version is not None else "" + return f"{kind}{tag} ({from_ref} -> {to_ref}): writer [{writer}]; reader [{reader}]" + + +# Index kinds covered by the maintenance-sequence search. 
+SCALAR_KINDS = ["BTREE", "BITMAP", "LABEL_LIST", "NGRAM", "ZONEMAP", "BLOOMFILTER"] +ALL_KINDS = ["INVERTED", *SCALAR_KINDS] + + +class IndexScenario: + """A picklable, kind-parameterized scenario run across a version split.""" + + def __init__(self, kind, path, setup_ops, exercise_ops): + self.kind = kind + self.path = str(path) + self.setup_ops = list(setup_ops) + self.exercise_ops = list(exercise_ops) + self.next_idx = 0 + + # --- in-venv helpers (only lance + pyarrow available) --- + def _open(self): + import lance + + session = lance.Session(index_cache_size_bytes=0, metadata_cache_size_bytes=0) + return lance.dataset(self.path, session=session) + + def _batch(self, a, b): + import pyarrow as pa + + idx = list(range(a, b)) + if self.kind == "INVERTED": + # Each row's text mixes tokens of different frequency: a unique term, a + # mid-frequency bucket (~1/7 of rows), and one shared by every row. Sampling + # across that spread exercises postings of varied length. + return pa.table( + {"idx": idx, "key": [f"term{i} bucket{i % 7} shared" for i in idx]} + ) + if self.kind == "LABEL_LIST": + return pa.table({"idx": idx, "key": [[f"l{i % 8}"] for i in idx]}) + if self.kind == "NGRAM": + return pa.table({"idx": idx, "key": [f"w{i % 50}x" for i in idx]}) + # BTREE / BITMAP / ZONEMAP / BLOOMFILTER: integer column + card = 8 if self.kind == "BITMAP" else 50 + key = [i if self.kind == "ZONEMAP" else i % card for i in idx] + return pa.table({"idx": idx, "key": key}) + + def _index_type(self): + return "INVERTED" if self.kind == "INVERTED" else self.kind + + def _oracle_pred(self): + if self.kind == "LABEL_LIST": + return "array_has_any(key, ['l3'])" + if self.kind == "NGRAM": + return "contains(key, 'w3x')" + if self.kind == "ZONEMAP": + return "key >= 100 AND key < 300" + return "key == 3" # BTREE / BITMAP / BLOOMFILTER + + # --- ops --- + def _op_W(self): + import lance + + a, b = self.next_idx, self.next_idx + ROWS_PER_WRITE + self.next_idx = b + tbl = self._batch(a, 
b) + if not os.path.exists(self.path): + lance.write_dataset(tbl, self.path) # single fragment + else: + self._open().insert(tbl) + + def _op_I(self): + kwargs = {"with_position": True} if self.kind == "INVERTED" else {} + self._open().create_scalar_index("key", self._index_type(), **kwargs) + + def _op_D(self): + # Partial-range delete inside the id space so compaction rewrites and remaps the + # index per-row. + if self.next_idx == 0: + return + lo, hi = self.next_idx // 4, self.next_idx // 2 + if hi > lo: + self._open().delete(f"idx >= {lo} AND idx < {hi}") + + def _op_C(self): + self._open().optimize.compact_files() + + def _op_Oa(self): + self._open().optimize.optimize_indices(num_indices_to_merge=0) + + def _op_Om(self): + self._open().optimize.optimize_indices(num_indices_to_merge=10) + + def _op_Od(self): + self._open().optimize.optimize_indices() + + def _run(self, ops): + for op in ops: + getattr(self, f"_op_{op}")() + + # --- methods invoked across the version split --- + def setup(self): + shutil.rmtree(self.path, ignore_errors=True) + self.next_idx = 0 + self._run(["W", "I"] + self.setup_ops) + return self.next_idx + + def exercise_and_check(self): + self._run(self.exercise_ops) + ds = self._open() + if self.kind == "INVERTED": + # Differential oracle: rebuild the token -> rows map from a full (unindexed) + # scan, then require an FTS search for a spread of sampled terms to return + # exactly those rows. Catches a merge that drops or misassigns postings, not + # just a row-count drift. (Tokens here are alphanumeric and space-separated, + # so a whitespace split reproduces lance's tokenization.) 
+ rows = ds.to_table(columns=["idx", "key"]) + idxs = rows.column("idx").to_pylist() + texts = rows.column("key").to_pylist() + truth = {} + for i, text in zip(idxs, texts): + for tok in text.split(): + truth.setdefault(tok, set()).add(i) + if not truth: + return # everything deleted; nothing to search + vocab = sorted(truth) + # A spread across the vocabulary plus the most common term. + sample = set(vocab[:: max(1, len(vocab) // 6)]) + sample.add(max(truth, key=lambda t: len(truth[t]))) + for term in sorted(sample): + hit = ds.to_table(full_text_query={"query": term, "columns": ["key"]}) + got = set(hit.column("idx").to_pylist()) + want = truth[term] + assert got == want, ( + f"FTS('{term}'): index returned {len(got)} rows, corpus has " + f"{len(want)} (missing {sorted(want - got)[:5]}, " + f"extra {sorted(got - want)[:5]})" + ) + return + # Same column/predicate, index on vs forced off: use_scalar_index=False makes + # lance ignore the index, so the plans differ iff the index is used. If they are + # identical the index wasn't consulted here (the planner chose a scan after + # deletes), so the comparison is vacuous -- skip rather than compare two scans. + pred = self._oracle_pred() + plan_index = ds.scanner(filter=pred).explain_plan(True) + plan_scan = ds.scanner(filter=pred, use_scalar_index=False).explain_plan(True) + if plan_index == plan_scan: + return + got = ds.to_table(filter=pred).num_rows + expected = ds.to_table(filter=pred, use_scalar_index=False).num_rows + assert got == expected, ( + f"{self.kind}: index gave {got} rows, full scan {expected}, for '{pred}'" + ) + + +def generate(max_length): + """Yield every (setup_ops, exercise_ops) whose combined length is 1..max_length, + breadth-first by total length (shorter first). `max_length` is the number of + maintenance ops after the implicit write + create-index, split between the writer + (setup) and reader (exercise) at every position. 
The order is neutral, so finding a + bug is a real search, not a sorted shortcut. The space grows fast with max_length, + so deeper bugs (ENT-1662 needs length 5) cost more to reach.""" + for total in range(1, max_length + 1): + for setup_len in range(total): # exercise gets total - setup_len >= 1 + for s in itertools.product(SETUP_TAIL_OPS, repeat=setup_len): + for e in itertools.product(EXERCISE_OPS, repeat=total - setup_len): + yield list(s), list(e) + + +def search( + venv_factory, + from_ref, + to_ref, + base_path, + kind, + max_length=4, + shard=0, + num_shards=1, + stop_on_first=True, + fts_version=None, +): + """Search index-maintenance sequences up to `max_length` ops for one `kind`, across + (from_ref -> to_ref). Runs only scenarios in this shard (i % num_shards == shard) so + the space can be split across parallel workers. For INVERTED, `fts_version` ("1" or + "2") pins the on-disk FTS format (LANCE_FTS_FORMAT_VERSION) on both sides; both are + Fst token sets and exercise distinct merge paths. Returns failures; stops on the + first when `stop_on_first`.""" + from_venv = venv_factory.get_venv(from_ref) + to_venv = venv_factory.get_venv(to_ref) + env = {} + if kind == "INVERTED" and fts_version is not None: + env["LANCE_FTS_FORMAT_VERSION"] = str(fts_version) + base = Path(base_path) + failures = [] + # Each setup's aged dataset is built once under from_ref and snapshotted; every + # exercise for that setup runs on a *copy* of it (a dir copy is far cheaper + # than rebuilding the index). Cached per shard, keyed by the setup ops. 
+ snapshots = {} # tuple(setup) -> (snapshot_path, next_idx), or None if setup failed + try: + for i, (setup_tail, exercise) in enumerate(generate(max_length)): + if i % num_shards != shard: + continue + key = tuple(setup_tail) + if key not in snapshots: + snap = base / f"snap_{kind}_{len(snapshots)}" + shutil.rmtree(snap, ignore_errors=True) + builder = IndexScenario(kind, snap, setup_tail, []) + try: + next_idx = from_venv.execute_method(builder, "setup", env) + snapshots[key] = (snap, next_idx) + except Exception as e: + label = describe( + kind, from_ref, to_ref, setup_tail, [], fts_version + ) + err = str(e).strip() + failures.append({"run": i, "sequence": label, "error": err}) + snapshots[key] = None + shutil.rmtree(snap, ignore_errors=True) + if stop_on_first: + break + entry = snapshots[key] + if entry is None: + continue # setup failed; skip its exercises + snap, next_idx = entry + ex_path = base / f"ex_{kind}_{i}" + shutil.rmtree(ex_path, ignore_errors=True) + shutil.copytree(snap, ex_path) + scenario = IndexScenario(kind, ex_path, setup_tail, exercise) + scenario.next_idx = next_idx + label = describe(kind, from_ref, to_ref, setup_tail, exercise, fts_version) + try: + to_venv.execute_method(scenario, "exercise_and_check", env) + except Exception as e: + error = str(e).strip() + failures.append({"run": i, "sequence": label, "error": error}) + if stop_on_first: + break + finally: + shutil.rmtree(ex_path, ignore_errors=True) + finally: + for entry in snapshots.values(): + if entry is not None: + shutil.rmtree(entry[0], ignore_errors=True) + return failures diff --git a/python/python/tests/compat/test_index_sequence.py b/python/python/tests/compat/test_index_sequence.py new file mode 100644 index 00000000000..4d0db694064 --- /dev/null +++ b/python/python/tests/compat/test_index_sequence.py @@ -0,0 +1,81 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +"""Cross-version index maintenance-sequence search, wired 
as a compat test. + +For each index kind, generate maintenance-op sequences from a small grammar, run the +setup half under an older writer ref and the exercise half under a newer reader, and +check after each that the reader did not panic and that an index query matches a full +scan. Discovers cross-version regressions (e.g. ENT-1662, under INVERTED) with no +hand-coded sequence. + +Refs and max length are environment-driven so the suite can run between two refs +(versions, commits, or branches): COMPAT_FROM_REF / COMPAT_TO_REF / COMPAT_MAX_LENGTH / +COMPAT_KINDS (comma-separated subset of kinds) / COMPAT_SHARDS (split each kind's search +into this many cases so pytest-xdist (`-n auto`) parallelizes them across cores). +""" + +import os + +import pytest + +from .compat_decorator import pylance_stable_versions +from .compat_sequence import ALL_KINDS, search + + +def _default_refs(): + """The two most recent published stable releases (older -> newer).""" + versions = pylance_stable_versions() + if len(versions) >= 2: + return str(versions[-2]), str(versions[-1]) + return "6.0.1", "7.0.0" # fallback if PyPI is unreachable + + +_default_from, _default_to = _default_refs() +FROM_REF = os.environ.get("COMPAT_FROM_REF") or _default_from +TO_REF = os.environ.get("COMPAT_TO_REF") or _default_to +MAX_LENGTH = int(os.environ.get("COMPAT_MAX_LENGTH", "4")) +KINDS = os.environ.get("COMPAT_KINDS", ",".join(ALL_KINDS)).split(",") +# Many small shards (default 4x cores) so xdist's dynamic scheduler keeps every worker +# busy and an oversubscribed `-n` has work to overlap. +NUM_SHARDS = int(os.environ.get("COMPAT_SHARDS", str((os.cpu_count() or 1) * 4))) + + +def _cases(): + """(kind, fts_version) cases. 
FTS runs under both on-disk formats (v1, v2); the + scalar kinds are format-agnostic and run once.""" + cases = [] + for kind in KINDS: + if kind == "INVERTED": + cases.extend([("INVERTED", "1"), ("INVERTED", "2")]) + else: + cases.append((kind, None)) + return cases + + +CASES = _cases() +CASE_IDS = [k if v is None else f"{k}-fmtv{v}" for k, v in CASES] + + +@pytest.mark.compat +@pytest.mark.parametrize("kind,fts_version", CASES, ids=CASE_IDS) +@pytest.mark.parametrize("shard", range(NUM_SHARDS)) +def test_index_maintenance_sequence_search( + venv_factory, tmp_path, kind, fts_version, shard +): + failures = search( + venv_factory, + FROM_REF, + TO_REF, + tmp_path, + kind, + max_length=MAX_LENGTH, + shard=shard, + num_shards=NUM_SHARDS, + fts_version=fts_version, + ) + # First line is the failure itself so it shows in pytest's bottom summary; the rest + # (if more than one) appears in the failure body. + assert not failures, "\n".join( + f"{f['sequence']} ==> {f['error']}" for f in failures + ) diff --git a/python/python/tests/compat/venv_manager.py b/python/python/tests/compat/venv_manager.py index 9e16b7e2dc7..c4b23486cd3 100644 --- a/python/python/tests/compat/venv_manager.py +++ b/python/python/tests/compat/venv_manager.py @@ -8,15 +8,39 @@ with specific Lance versions installed. 
""" +import contextlib +import glob import os import pickle +import re +import shutil import struct import subprocess import sys from pathlib import Path from typing import Any, Optional -from packaging.version import Version +from packaging.version import InvalidVersion, Version + +try: + import fcntl +except ImportError: # pragma: no cover - non-POSIX + fcntl = None + + +@contextlib.contextmanager +def _venv_lock(lock_path: Path): + """Hold an exclusive lock so parallel workers don't race creating the same venv.""" + lock_path.parent.mkdir(parents=True, exist_ok=True) + with open(lock_path, "w") as handle: + if fcntl is not None: + fcntl.flock(handle, fcntl.LOCK_EX) + try: + yield + finally: + if fcntl is not None: + fcntl.flock(handle, fcntl.LOCK_UN) + NAMESPACE_0_6_DEPENDENCY = "lance-namespace<0.7" NAMESPACE_0_7_DEPENDENCY = "lance-namespace>=0.7.2,<0.8" @@ -31,6 +55,47 @@ def _lance_namespace_dependency(pylance_version: str) -> str: return NAMESPACE_0_6_DEPENDENCY +def _is_release_version(ref: str) -> bool: + """A ref is treated as a published release (install a wheel) if it parses as a + version; anything else (commit sha, branch, tag) is built from source.""" + try: + Version(ref) + return True + except InvalidVersion: + return False + + +def _prebuilt_wheel_for(ref: str) -> Optional[str]: + """A prebuilt wheel to install for `ref` instead of building it from source. + + When CI has already built a ref (e.g. the PR head, built once by the Python build + job), COMPAT_PREBUILT_REF names that ref and COMPAT_PREBUILT_WHEEL points at the + wheel (a path or glob). Lets the PR workflow reuse that wheel rather than rebuilding + the reader. Returns None when no prebuilt wheel applies to `ref`. 
+ """ + if os.environ.get("COMPAT_PREBUILT_REF") != ref: + return None + pattern = os.environ.get("COMPAT_PREBUILT_WHEEL") + if not pattern: + return None + matches = sorted(glob.glob(pattern)) + if not matches: + raise FileNotFoundError( + f"COMPAT_PREBUILT_WHEEL={pattern!r} matched no wheel for ref {ref!r}" + ) + return matches[0] + + +def _repo_root() -> Path: + """Lance source checkout holding this test file (used to build refs from source).""" + # .../python/python/tests/compat/venv_manager.py -> repo root is parents[4] + return Path(__file__).resolve().parents[4] + + +def _safe(ref: str) -> str: + return re.sub(r"[^A-Za-z0-9._-]", "_", ref) + + class VenvExecutor: """Manages a virtual environment with a specific Lance version.""" @@ -52,6 +117,8 @@ def __init__(self, version: str, venv_path: Path, persistent: bool = False): self.persistent = persistent self._created = False self._subprocess: Optional[subprocess.Popen] = None + self._stderr_path: Optional[Path] = None + self._stderr_file = None @property def python_path(self) -> Path: @@ -59,54 +126,61 @@ def python_path(self) -> Path: return self.venv_path / "Scripts" / "python.exe" return self.venv_path / "bin" / "python" - def _validate_venv(self) -> bool: - """Check if existing venv is valid and has correct Lance version.""" - if not self.venv_path.exists(): - return False + @property + def _marker_path(self) -> Path: + return self.venv_path / ".compat_ref" + def _validate_venv(self) -> bool: + """A cached venv is reusable if it exists and its recorded ref matches. 
A marker + file is used (not `pip show`) so source-built commit refs also validate.""" if not self.python_path.exists(): return False - - # Check if pylance is installed with correct version try: - result = subprocess.run( - [str(self.python_path), "-m", "pip", "show", "pylance"], - capture_output=True, - text=True, - timeout=5, - ) - if result.returncode != 0: - return False - - # Parse version from output - for line in result.stdout.splitlines(): - if line.startswith("Version:"): - installed_version = line.split(":", 1)[1].strip() - return installed_version == self.version - - except Exception: + return self._marker_path.read_text().strip() == self.version + except OSError: return False - return False - def create(self): """Create the virtual environment and install the specified Lance version.""" if self._created: return - - # Check if persistent venv already exists and is valid if self.persistent and self._validate_venv(): self._created = True return - # Create virtual environment + # Lock so parallel workers don't build the same venv at once; re-check in the + # lock since another worker may have just finished it. + with _venv_lock(self.venv_path.parent / f".lock_{_safe(self.version)}"): + if not self._validate_venv(): + if self.venv_path.exists(): + shutil.rmtree(self.venv_path) # drop any partial build + subprocess.run( + [sys.executable, "-m", "venv", str(self.venv_path)], + check=True, + capture_output=True, + ) + # Prefer a wheel CI already built for this ref; else a published + # release installs its wheel; else build the ref (commit/branch/tag) + # from source -- so two arbitrary refs can be compared and only the + # ones without a wheel pay a build. 
+ prebuilt = _prebuilt_wheel_for(self.version) + if prebuilt is not None: + self._install_wheel(prebuilt) + elif _is_release_version(self.version): + self._install_release_wheel() + else: + self._build_from_source() + self._marker_path.write_text(self.version) + self._created = True + + def _install_wheel(self, wheel: str): subprocess.run( - [sys.executable, "-m", "venv", str(self.venv_path)], + [str(self.python_path), "-m", "pip", "install", "--quiet", wheel, "pytest"], check=True, capture_output=True, ) - # Install specific pylance version and pytest + def _install_release_wheel(self): subprocess.run( [ str(self.python_path), @@ -131,7 +205,55 @@ def create(self): capture_output=True, ) - self._created = True + def _build_from_source(self): + """Build a wheel for an arbitrary git ref via a worktree + maturin, then install + it. The worktree/build is cached by ref so it is paid at most once.""" + py = str(self.python_path) + src = self.venv_path.parent / f"src_{_safe(self.version)}" + if not src.exists(): + subprocess.run( + [ + "git", + "-C", + str(_repo_root()), + "worktree", + "add", + "--detach", + str(src), + self.version, + ], + check=True, + capture_output=True, + ) + subprocess.run( + [py, "-m", "pip", "install", "--quiet", "maturin", "pytest", "pyarrow"], + check=True, + capture_output=True, + ) + wheels = src / "target" / "compat-wheels" + subprocess.run( + [ + py, + "-m", + "maturin", + "build", + "--release", + "--interpreter", + py, + "-m", + str(src / "python" / "Cargo.toml"), + "--out", + str(wheels), + ], + check=True, + capture_output=True, + ) + wheel = next(wheels.glob("pylance-*.whl")) + subprocess.run( + [py, "-m", "pip", "install", "--quiet", str(wheel)], + check=True, + capture_output=True, + ) def _ensure_subprocess(self): """Ensure the persistent subprocess is running.""" @@ -147,14 +269,35 @@ def _ensure_subprocess(self): tests_dir = Path(__file__).parent.parent env["PYTHONPATH"] = str(tests_dir) + # Capture stderr to a file so a Rust 
panic (which crashes the runner) can be + # surfaced in the error instead of an opaque "broken pipe". + self._stderr_path = self.venv_path / ".runner_stderr.log" + self._stderr_file = open(self._stderr_path, "w") self._subprocess = subprocess.Popen( [str(self.python_path), "-u", str(runner_script)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, - stderr=None, # Inherit stderr to see timing messages + stderr=self._stderr_file, env=env, ) + def _last_panic(self) -> str: + """Pull the panic message from the runner's captured stderr, if any.""" + try: + text = self._stderr_path.read_text() + except (OSError, AttributeError): + return "" + lines = text.splitlines() + for i, line in enumerate(lines): + if "panicked at" in line: + # Compact the long path to just "builder.rs:962:57" + loc = line.split("panicked at", 1)[1].strip().rstrip(":") + loc = loc.rsplit("/", 1)[-1] + msg = lines[i + 1].strip() if i + 1 < len(lines) else "" + return f"panic at {loc}: {msg}" if msg else f"panic at {loc}" + tail = [line.strip() for line in lines if line.strip()] + return tail[-1] if tail else "" + def _send_message(self, obj: Any): """Send a length-prefixed pickled message to subprocess.""" data = pickle.dumps(obj) @@ -165,18 +308,19 @@ def _send_message(self, obj: Any): def _receive_message(self) -> Any: """Receive a length-prefixed pickled message from subprocess.""" - # Read 4-byte length header + # Short reads mean the subprocess closed stdout (usually a crash); raise + # EOFError so the caller can surface the panic from captured stderr. 
length_bytes = self._subprocess.stdout.read(4) if len(length_bytes) < 4: - raise RuntimeError("Failed to read message length from subprocess") + raise EOFError("subprocess closed stdout before sending a message length") length = struct.unpack(">I", length_bytes)[0] # Read message data data = self._subprocess.stdout.read(length) if len(data) < length: - raise RuntimeError( - f"Incomplete message: expected {length} bytes, got {len(data)}" + raise EOFError( + f"incomplete message: expected {length} bytes, got {len(data)}" ) return pickle.loads(data) @@ -234,11 +378,15 @@ def execute_method( raise RuntimeError(error_msg) except (BrokenPipeError, EOFError, struct.error) as e: - # Subprocess died or communication failed - raise RuntimeError( - f"Communication with venv subprocess failed (Lance {self.version}):\n" - f"Error: {e}" - ) + # Subprocess died (usually a Rust panic); flush it, then surface that. + if self._subprocess is not None: + try: + self._subprocess.wait(timeout=2) + except Exception: + pass + panic = self._last_panic() + detail = panic or f"subprocess communication failed: {e}" + raise RuntimeError(f"Lance {self.version}: {detail}") def cleanup(self): """Remove the virtual environment directory and terminate subprocess.""" @@ -295,7 +443,7 @@ def get_venv(self, version: str) -> VenvExecutor: Executor for the specified version """ if version not in self.venvs: - venv_path = self.base_path / f"venv_{version}" + venv_path = self.base_path / f"venv_{_safe(version)}" executor = VenvExecutor(version, venv_path, persistent=self.persistent) executor.create() self.venvs[version] = executor From 0aec53669530d5026a8448c050c58c0a5acff194 Mon Sep 17 00:00:00 2001 From: Lance Release Bot Date: Wed, 10 Jun 2026 20:34:56 +0000 Subject: [PATCH 078/177] chore: release beta version 8.0.0-beta.10 --- .bumpversion.toml | 2 +- Cargo.lock | 52 +++++++++++++++++++-------------------- Cargo.toml | 44 ++++++++++++++++----------------- java/lance-jni/Cargo.lock | 44 
++++++++++++++++----------------- java/lance-jni/Cargo.toml | 2 +- java/pom.xml | 2 +- python/Cargo.lock | 44 ++++++++++++++++----------------- python/Cargo.toml | 2 +- 8 files changed, 96 insertions(+), 96 deletions(-) diff --git a/.bumpversion.toml b/.bumpversion.toml index 998832cd870..e1d1a7f2a8c 100644 --- a/.bumpversion.toml +++ b/.bumpversion.toml @@ -1,5 +1,5 @@ [tool.bumpversion] -current_version = "8.0.0-beta.9" +current_version = "8.0.0-beta.10" parse = "(?P\\d+)\\.(?P\\d+)\\.(?P\\d+)(-(?P(beta|rc))\\.(?P\\d+))?" serialize = [ "{major}.{minor}.{patch}-{prerelease}.{prerelease_num}", diff --git a/Cargo.lock b/Cargo.lock index d24bd4ed3ac..03f32545612 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3146,7 +3146,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4458,7 +4458,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "all_asserts", "approx", @@ -4561,7 +4561,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow-array", "arrow-buffer", @@ -4609,7 +4609,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrayref", "paste", @@ -4618,7 +4618,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow-array", "arrow-buffer", @@ -4658,7 +4658,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow", "arrow-array", @@ -4691,7 +4691,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow", 
"arrow-array", @@ -4711,7 +4711,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "proc-macro2", "quote", @@ -4720,7 +4720,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow-arith", "arrow-array", @@ -4765,7 +4765,7 @@ dependencies = [ [[package]] name = "lance-examples" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "all_asserts", "arrow", @@ -4791,7 +4791,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow-arith", "arrow-array", @@ -4830,7 +4830,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "datafusion", "geo-traits", @@ -4844,7 +4844,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "approx", "arc-swap", @@ -4920,7 +4920,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow", "arrow-arith", @@ -4968,7 +4968,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "approx", "arrow-array", @@ -4987,7 +4987,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow", "async-trait", @@ -4999,7 +4999,7 @@ dependencies = [ [[package]] name = "lance-namespace-datafusion" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow-array", "arrow-schema", @@ -5015,7 +5015,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow", "arrow-array", @@ -5060,9 +5060,9 @@ dependencies = [ [[package]] name = "lance-namespace-reqwest-client" -version = 
"0.8.2" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a09733325812e046cb217d548afc4864dedb59545389d45cd498b3d8ecb0d20" +checksum = "04b4e5caefa132a9cce54b2d4dc95016b949b3a290a83ad5057e705df43d75be" dependencies = [ "reqwest 0.12.28", "serde", @@ -5074,7 +5074,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow-array", "arrow-buffer", @@ -5092,7 +5092,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow", "arrow-array", @@ -5138,7 +5138,7 @@ dependencies = [ [[package]] name = "lance-test-macros" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "proc-macro2", "quote", @@ -5147,7 +5147,7 @@ dependencies = [ [[package]] name = "lance-testing" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow-array", "arrow-schema", @@ -5160,7 +5160,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "icu_segmenter", "jieba-rs", @@ -5172,7 +5172,7 @@ dependencies = [ [[package]] name = "lance-tools" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "clap", "lance-core", diff --git a/Cargo.toml b/Cargo.toml index 3a3d7f7f4c6..6679ed72421 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ resolver = "3" [workspace.package] -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" @@ -57,27 +57,27 @@ rust-version = "1.91.0" [workspace.dependencies] arc-swap = "1.7" libc = "0.2.176" -lance = { version = "=8.0.0-beta.9", path = "./rust/lance", default-features = false } -lance-arrow = { version = "=8.0.0-beta.9", path = "./rust/lance-arrow" } -lance-core = { version = "=8.0.0-beta.9", path = "./rust/lance-core" } -lance-datafusion = { version = "=8.0.0-beta.9", path = 
"./rust/lance-datafusion" } -lance-datagen = { version = "=8.0.0-beta.9", path = "./rust/lance-datagen" } -lance-derive = { version = "=8.0.0-beta.9", path = "./rust/lance-derive" } -lance-encoding = { version = "=8.0.0-beta.9", path = "./rust/lance-encoding" } -lance-file = { version = "=8.0.0-beta.9", path = "./rust/lance-file" } -lance-geo = { version = "=8.0.0-beta.9", path = "./rust/lance-geo" } -lance-index = { version = "=8.0.0-beta.9", path = "./rust/lance-index" } -lance-io = { version = "=8.0.0-beta.9", path = "./rust/lance-io", default-features = false } -lance-linalg = { version = "=8.0.0-beta.9", path = "./rust/lance-linalg" } -lance-namespace = { version = "=8.0.0-beta.9", path = "./rust/lance-namespace" } -lance-namespace-impls = { version = "=8.0.0-beta.9", path = "./rust/lance-namespace-impls" } +lance = { version = "=8.0.0-beta.10", path = "./rust/lance", default-features = false } +lance-arrow = { version = "=8.0.0-beta.10", path = "./rust/lance-arrow" } +lance-core = { version = "=8.0.0-beta.10", path = "./rust/lance-core" } +lance-datafusion = { version = "=8.0.0-beta.10", path = "./rust/lance-datafusion" } +lance-datagen = { version = "=8.0.0-beta.10", path = "./rust/lance-datagen" } +lance-derive = { version = "=8.0.0-beta.10", path = "./rust/lance-derive" } +lance-encoding = { version = "=8.0.0-beta.10", path = "./rust/lance-encoding" } +lance-file = { version = "=8.0.0-beta.10", path = "./rust/lance-file" } +lance-geo = { version = "=8.0.0-beta.10", path = "./rust/lance-geo" } +lance-index = { version = "=8.0.0-beta.10", path = "./rust/lance-index" } +lance-io = { version = "=8.0.0-beta.10", path = "./rust/lance-io", default-features = false } +lance-linalg = { version = "=8.0.0-beta.10", path = "./rust/lance-linalg" } +lance-namespace = { version = "=8.0.0-beta.10", path = "./rust/lance-namespace" } +lance-namespace-impls = { version = "=8.0.0-beta.10", path = "./rust/lance-namespace-impls" } lance-namespace-datafusion = { version = 
"=7.0.0-beta.9", path = "./rust/lance-namespace-datafusion" } lance-namespace-reqwest-client = "0.8.2" -lance-select = { version = "=8.0.0-beta.9", path = "./rust/lance-select" } -lance-tokenizer = { version = "=8.0.0-beta.9", path = "./rust/lance-tokenizer" } -lance-table = { version = "=8.0.0-beta.9", path = "./rust/lance-table" } -lance-test-macros = { version = "=8.0.0-beta.9", path = "./rust/lance-test-macros" } -lance-testing = { version = "=8.0.0-beta.9", path = "./rust/lance-testing" } +lance-select = { version = "=8.0.0-beta.10", path = "./rust/lance-select" } +lance-tokenizer = { version = "=8.0.0-beta.10", path = "./rust/lance-tokenizer" } +lance-table = { version = "=8.0.0-beta.10", path = "./rust/lance-table" } +lance-test-macros = { version = "=8.0.0-beta.10", path = "./rust/lance-test-macros" } +lance-testing = { version = "=8.0.0-beta.10", path = "./rust/lance-testing" } approx = "0.5.1" # Note that this one does not include pyarrow arrow = { version = "58.0.0", optional = false, features = ["prettyprint"] } @@ -104,7 +104,7 @@ half = { "version" = "2.1", default-features = false, features = [ "num-traits", "std", ] } -lance-bitpacking = { version = "=8.0.0-beta.9", path = "./rust/compression/bitpacking" } +lance-bitpacking = { version = "=8.0.0-beta.10", path = "./rust/compression/bitpacking" } bitpacking = "0.9" bitvec = "1" bytes = "1.11.1" @@ -143,7 +143,7 @@ datafusion-substrait = "53.0.0" dirs = "6.0.0" either = "1.0" fst = { version = "0.4.7", features = ["levenshtein"] } -fsst = { version = "=8.0.0-beta.9", path = "./rust/compression/fsst" } +fsst = { version = "=8.0.0-beta.10", path = "./rust/compression/fsst" } futures = "0.3" geoarrow-array = "0.8" geoarrow-schema = "0.8" diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index 0083a0c0c63..5bd97c17301 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -2549,7 +2549,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" 
[[package]] name = "fsst" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow-array", "rand 0.9.4", @@ -3749,7 +3749,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arc-swap", "arrow", @@ -3822,7 +3822,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow-array", "arrow-buffer", @@ -3864,7 +3864,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrayref", "paste", @@ -3873,7 +3873,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow-array", "arrow-buffer", @@ -3911,7 +3911,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow", "arrow-array", @@ -3943,7 +3943,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow", "arrow-array", @@ -3961,7 +3961,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "proc-macro2", "quote", @@ -3970,7 +3970,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow-arith", "arrow-array", @@ -4005,7 +4005,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow-arith", "arrow-array", @@ -4035,7 +4035,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "datafusion", "geo-traits", @@ -4049,7 +4049,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.9" +version = 
"8.0.0-beta.10" dependencies = [ "arc-swap", "arrow", @@ -4116,7 +4116,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow", "arrow-arith", @@ -4157,7 +4157,7 @@ dependencies = [ [[package]] name = "lance-jni" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow", "arrow-array", @@ -4193,7 +4193,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow-array", "arrow-buffer", @@ -4208,7 +4208,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow", "async-trait", @@ -4220,7 +4220,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow", "arrow-ipc", @@ -4250,9 +4250,9 @@ dependencies = [ [[package]] name = "lance-namespace-reqwest-client" -version = "0.8.2" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a09733325812e046cb217d548afc4864dedb59545389d45cd498b3d8ecb0d20" +checksum = "04b4e5caefa132a9cce54b2d4dc95016b949b3a290a83ad5057e705df43d75be" dependencies = [ "reqwest 0.12.28", "serde", @@ -4264,7 +4264,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow-array", "arrow-buffer", @@ -4279,7 +4279,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow", "arrow-array", @@ -4316,7 +4316,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "icu_segmenter", "rust-stemmers", diff --git a/java/lance-jni/Cargo.toml b/java/lance-jni/Cargo.toml index 40273380fe4..50a1c025de1 100644 --- a/java/lance-jni/Cargo.toml +++ b/java/lance-jni/Cargo.toml @@ -1,6 +1,6 
@@ [package] name = "lance-jni" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" edition = "2024" authors = ["Lance Devs "] rust-version = "1.91" diff --git a/java/pom.xml b/java/pom.xml index d6491f62fd5..c3e4a9e6219 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -7,7 +7,7 @@ org.lance lance-core Lance Core - 8.0.0-beta.9 + 8.0.0-beta.10 jar Lance Format Java API diff --git a/python/Cargo.lock b/python/Cargo.lock index 15a54e6759a..568f5a43fc5 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -2899,7 +2899,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4115,7 +4115,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arc-swap", "arrow", @@ -4189,7 +4189,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow-array", "arrow-buffer", @@ -4231,7 +4231,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrayref", "paste", @@ -4240,7 +4240,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow-array", "arrow-buffer", @@ -4278,7 +4278,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow", "arrow-array", @@ -4310,7 +4310,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow", "arrow-array", @@ -4328,7 +4328,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "proc-macro2", "quote", @@ -4337,7 +4337,7 @@ 
dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow-arith", "arrow-array", @@ -4372,7 +4372,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow-arith", "arrow-array", @@ -4402,7 +4402,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "datafusion", "geo-traits", @@ -4416,7 +4416,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arc-swap", "arrow", @@ -4484,7 +4484,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow", "arrow-arith", @@ -4525,7 +4525,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow-array", "arrow-buffer", @@ -4540,7 +4540,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow", "async-trait", @@ -4552,7 +4552,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow", "arrow-ipc", @@ -4582,9 +4582,9 @@ dependencies = [ [[package]] name = "lance-namespace-reqwest-client" -version = "0.8.2" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a09733325812e046cb217d548afc4864dedb59545389d45cd498b3d8ecb0d20" +checksum = "04b4e5caefa132a9cce54b2d4dc95016b949b3a290a83ad5057e705df43d75be" dependencies = [ "reqwest 0.12.28", "serde", @@ -4596,7 +4596,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow-array", "arrow-buffer", @@ -4611,7 +4611,7 @@ dependencies = [ [[package]] name = "lance-table" -version = 
"8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow", "arrow-array", @@ -4650,7 +4650,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "icu_segmenter", "jieba-rs", @@ -6138,7 +6138,7 @@ dependencies = [ [[package]] name = "pylance" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" dependencies = [ "arrow", "arrow-array", diff --git a/python/Cargo.toml b/python/Cargo.toml index d8fa41826a3..54f8f6a2d1b 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pylance" -version = "8.0.0-beta.9" +version = "8.0.0-beta.10" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" From 932860f7227e53cf0453b1f944da144aff55a8c5 Mon Sep 17 00:00:00 2001 From: Wyatt Alt Date: Wed, 10 Jun 2026 14:07:57 -0700 Subject: [PATCH 079/177] fix(namespace): tolerate new IndexContent fields from reqwest client (#7212) Fixes the cargo lockfile. --- rust/lance-namespace-impls/src/dir.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/rust/lance-namespace-impls/src/dir.rs b/rust/lance-namespace-impls/src/dir.rs index 8637b6d61f2..053b339a4f3 100644 --- a/rust/lance-namespace-impls/src/dir.rs +++ b/rust/lance-namespace-impls/src/dir.rs @@ -3438,6 +3438,7 @@ impl LanceNamespace for DirectoryNamespace { index_uuid: description.metadata()[0].uuid.to_string(), columns, status: "SUCCEEDED".to_string(), + ..Default::default() }) }) .collect::>>()?; From 739ef902201c90b3f8c6d005762d7fd161782bf2 Mon Sep 17 00:00:00 2001 From: Lance Release Bot Date: Wed, 10 Jun 2026 21:16:06 +0000 Subject: [PATCH 080/177] chore: release beta version 8.0.0-beta.11 --- .bumpversion.toml | 2 +- Cargo.lock | 48 +++++++++++++++++++-------------------- Cargo.toml | 44 +++++++++++++++++------------------ java/lance-jni/Cargo.lock | 40 ++++++++++++++++---------------- java/lance-jni/Cargo.toml | 2 +- java/pom.xml | 2 +- python/Cargo.lock | 40 ++++++++++++++++---------------- 
python/Cargo.toml | 2 +- 8 files changed, 90 insertions(+), 90 deletions(-) diff --git a/.bumpversion.toml b/.bumpversion.toml index e1d1a7f2a8c..7d766a80aff 100644 --- a/.bumpversion.toml +++ b/.bumpversion.toml @@ -1,5 +1,5 @@ [tool.bumpversion] -current_version = "8.0.0-beta.10" +current_version = "8.0.0-beta.11" parse = "(?P\\d+)\\.(?P\\d+)\\.(?P\\d+)(-(?P(beta|rc))\\.(?P\\d+))?" serialize = [ "{major}.{minor}.{patch}-{prerelease}.{prerelease_num}", diff --git a/Cargo.lock b/Cargo.lock index 03f32545612..866eb9b4b0e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3146,7 +3146,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4458,7 +4458,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "all_asserts", "approx", @@ -4561,7 +4561,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow-array", "arrow-buffer", @@ -4609,7 +4609,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrayref", "paste", @@ -4618,7 +4618,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow-array", "arrow-buffer", @@ -4658,7 +4658,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow", "arrow-array", @@ -4691,7 +4691,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow", "arrow-array", @@ -4711,7 +4711,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" 
dependencies = [ "proc-macro2", "quote", @@ -4720,7 +4720,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow-arith", "arrow-array", @@ -4765,7 +4765,7 @@ dependencies = [ [[package]] name = "lance-examples" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "all_asserts", "arrow", @@ -4791,7 +4791,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow-arith", "arrow-array", @@ -4830,7 +4830,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "datafusion", "geo-traits", @@ -4844,7 +4844,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "approx", "arc-swap", @@ -4920,7 +4920,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow", "arrow-arith", @@ -4968,7 +4968,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "approx", "arrow-array", @@ -4987,7 +4987,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow", "async-trait", @@ -4999,7 +4999,7 @@ dependencies = [ [[package]] name = "lance-namespace-datafusion" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow-array", "arrow-schema", @@ -5015,7 +5015,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow", "arrow-array", @@ -5074,7 +5074,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow-array", "arrow-buffer", @@ -5092,7 +5092,7 @@ dependencies = [ [[package]] name = 
"lance-table" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow", "arrow-array", @@ -5138,7 +5138,7 @@ dependencies = [ [[package]] name = "lance-test-macros" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "proc-macro2", "quote", @@ -5147,7 +5147,7 @@ dependencies = [ [[package]] name = "lance-testing" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow-array", "arrow-schema", @@ -5160,7 +5160,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "icu_segmenter", "jieba-rs", @@ -5172,7 +5172,7 @@ dependencies = [ [[package]] name = "lance-tools" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "clap", "lance-core", diff --git a/Cargo.toml b/Cargo.toml index 6679ed72421..8cd3b02de7d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ resolver = "3" [workspace.package] -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" @@ -57,27 +57,27 @@ rust-version = "1.91.0" [workspace.dependencies] arc-swap = "1.7" libc = "0.2.176" -lance = { version = "=8.0.0-beta.10", path = "./rust/lance", default-features = false } -lance-arrow = { version = "=8.0.0-beta.10", path = "./rust/lance-arrow" } -lance-core = { version = "=8.0.0-beta.10", path = "./rust/lance-core" } -lance-datafusion = { version = "=8.0.0-beta.10", path = "./rust/lance-datafusion" } -lance-datagen = { version = "=8.0.0-beta.10", path = "./rust/lance-datagen" } -lance-derive = { version = "=8.0.0-beta.10", path = "./rust/lance-derive" } -lance-encoding = { version = "=8.0.0-beta.10", path = "./rust/lance-encoding" } -lance-file = { version = "=8.0.0-beta.10", path = "./rust/lance-file" } -lance-geo = { version = "=8.0.0-beta.10", path = "./rust/lance-geo" } -lance-index = { version = "=8.0.0-beta.10", path = "./rust/lance-index" } -lance-io = { version = 
"=8.0.0-beta.10", path = "./rust/lance-io", default-features = false } -lance-linalg = { version = "=8.0.0-beta.10", path = "./rust/lance-linalg" } -lance-namespace = { version = "=8.0.0-beta.10", path = "./rust/lance-namespace" } -lance-namespace-impls = { version = "=8.0.0-beta.10", path = "./rust/lance-namespace-impls" } +lance = { version = "=8.0.0-beta.11", path = "./rust/lance", default-features = false } +lance-arrow = { version = "=8.0.0-beta.11", path = "./rust/lance-arrow" } +lance-core = { version = "=8.0.0-beta.11", path = "./rust/lance-core" } +lance-datafusion = { version = "=8.0.0-beta.11", path = "./rust/lance-datafusion" } +lance-datagen = { version = "=8.0.0-beta.11", path = "./rust/lance-datagen" } +lance-derive = { version = "=8.0.0-beta.11", path = "./rust/lance-derive" } +lance-encoding = { version = "=8.0.0-beta.11", path = "./rust/lance-encoding" } +lance-file = { version = "=8.0.0-beta.11", path = "./rust/lance-file" } +lance-geo = { version = "=8.0.0-beta.11", path = "./rust/lance-geo" } +lance-index = { version = "=8.0.0-beta.11", path = "./rust/lance-index" } +lance-io = { version = "=8.0.0-beta.11", path = "./rust/lance-io", default-features = false } +lance-linalg = { version = "=8.0.0-beta.11", path = "./rust/lance-linalg" } +lance-namespace = { version = "=8.0.0-beta.11", path = "./rust/lance-namespace" } +lance-namespace-impls = { version = "=8.0.0-beta.11", path = "./rust/lance-namespace-impls" } lance-namespace-datafusion = { version = "=7.0.0-beta.9", path = "./rust/lance-namespace-datafusion" } lance-namespace-reqwest-client = "0.8.2" -lance-select = { version = "=8.0.0-beta.10", path = "./rust/lance-select" } -lance-tokenizer = { version = "=8.0.0-beta.10", path = "./rust/lance-tokenizer" } -lance-table = { version = "=8.0.0-beta.10", path = "./rust/lance-table" } -lance-test-macros = { version = "=8.0.0-beta.10", path = "./rust/lance-test-macros" } -lance-testing = { version = "=8.0.0-beta.10", path = "./rust/lance-testing" } 
+lance-select = { version = "=8.0.0-beta.11", path = "./rust/lance-select" } +lance-tokenizer = { version = "=8.0.0-beta.11", path = "./rust/lance-tokenizer" } +lance-table = { version = "=8.0.0-beta.11", path = "./rust/lance-table" } +lance-test-macros = { version = "=8.0.0-beta.11", path = "./rust/lance-test-macros" } +lance-testing = { version = "=8.0.0-beta.11", path = "./rust/lance-testing" } approx = "0.5.1" # Note that this one does not include pyarrow arrow = { version = "58.0.0", optional = false, features = ["prettyprint"] } @@ -104,7 +104,7 @@ half = { "version" = "2.1", default-features = false, features = [ "num-traits", "std", ] } -lance-bitpacking = { version = "=8.0.0-beta.10", path = "./rust/compression/bitpacking" } +lance-bitpacking = { version = "=8.0.0-beta.11", path = "./rust/compression/bitpacking" } bitpacking = "0.9" bitvec = "1" bytes = "1.11.1" @@ -143,7 +143,7 @@ datafusion-substrait = "53.0.0" dirs = "6.0.0" either = "1.0" fst = { version = "0.4.7", features = ["levenshtein"] } -fsst = { version = "=8.0.0-beta.10", path = "./rust/compression/fsst" } +fsst = { version = "=8.0.0-beta.11", path = "./rust/compression/fsst" } futures = "0.3" geoarrow-array = "0.8" geoarrow-schema = "0.8" diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index 5bd97c17301..f4cfc21ec9c 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -2549,7 +2549,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow-array", "rand 0.9.4", @@ -3749,7 +3749,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arc-swap", "arrow", @@ -3822,7 +3822,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ 
"arrow-array", "arrow-buffer", @@ -3864,7 +3864,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrayref", "paste", @@ -3873,7 +3873,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow-array", "arrow-buffer", @@ -3911,7 +3911,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow", "arrow-array", @@ -3943,7 +3943,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow", "arrow-array", @@ -3961,7 +3961,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "proc-macro2", "quote", @@ -3970,7 +3970,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow-arith", "arrow-array", @@ -4005,7 +4005,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow-arith", "arrow-array", @@ -4035,7 +4035,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "datafusion", "geo-traits", @@ -4049,7 +4049,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arc-swap", "arrow", @@ -4116,7 +4116,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow", "arrow-arith", @@ -4157,7 +4157,7 @@ dependencies = [ [[package]] name = "lance-jni" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow", "arrow-array", @@ -4193,7 +4193,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.10" 
+version = "8.0.0-beta.11" dependencies = [ "arrow-array", "arrow-buffer", @@ -4208,7 +4208,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow", "async-trait", @@ -4220,7 +4220,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow", "arrow-ipc", @@ -4264,7 +4264,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow-array", "arrow-buffer", @@ -4279,7 +4279,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow", "arrow-array", @@ -4316,7 +4316,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "icu_segmenter", "rust-stemmers", diff --git a/java/lance-jni/Cargo.toml b/java/lance-jni/Cargo.toml index 50a1c025de1..f1144423c0d 100644 --- a/java/lance-jni/Cargo.toml +++ b/java/lance-jni/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lance-jni" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" edition = "2024" authors = ["Lance Devs "] rust-version = "1.91" diff --git a/java/pom.xml b/java/pom.xml index c3e4a9e6219..e5791f8155d 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -7,7 +7,7 @@ org.lance lance-core Lance Core - 8.0.0-beta.10 + 8.0.0-beta.11 jar Lance Format Java API diff --git a/python/Cargo.lock b/python/Cargo.lock index 568f5a43fc5..879195811cf 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -2899,7 +2899,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4115,7 +4115,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version 
= "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arc-swap", "arrow", @@ -4189,7 +4189,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow-array", "arrow-buffer", @@ -4231,7 +4231,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrayref", "paste", @@ -4240,7 +4240,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow-array", "arrow-buffer", @@ -4278,7 +4278,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow", "arrow-array", @@ -4310,7 +4310,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow", "arrow-array", @@ -4328,7 +4328,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "proc-macro2", "quote", @@ -4337,7 +4337,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow-arith", "arrow-array", @@ -4372,7 +4372,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow-arith", "arrow-array", @@ -4402,7 +4402,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "datafusion", "geo-traits", @@ -4416,7 +4416,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arc-swap", "arrow", @@ -4484,7 +4484,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow", "arrow-arith", @@ -4525,7 +4525,7 @@ dependencies = [ 
[[package]] name = "lance-linalg" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow-array", "arrow-buffer", @@ -4540,7 +4540,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow", "async-trait", @@ -4552,7 +4552,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow", "arrow-ipc", @@ -4596,7 +4596,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow-array", "arrow-buffer", @@ -4611,7 +4611,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow", "arrow-array", @@ -4650,7 +4650,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "icu_segmenter", "jieba-rs", @@ -6138,7 +6138,7 @@ dependencies = [ [[package]] name = "pylance" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" dependencies = [ "arrow", "arrow-array", diff --git a/python/Cargo.toml b/python/Cargo.toml index 54f8f6a2d1b..f7d6280644a 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pylance" -version = "8.0.0-beta.10" +version = "8.0.0-beta.11" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" From a2ddebcc234ea7c97997428146ecb3462c7a82e2 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 10 Jun 2026 16:11:49 -0700 Subject: [PATCH 081/177] feat: populate enriched IndexContent fields in dir namespace ListTableIndices (#7109) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, the directory namespace's `list_table_indices` discarded everything from `describe_indices` except `name`/`uuid`/`columns`. 
This populates the enriched `IndexContent` fields from the `IndexDescription` already in hand, avoiding an N+1 `DescribeTableIndexStats` call per index: - `index_type` ← `index_type()` - `type_url` ← `type_url()` - `num_indexed_rows` ← `rows_indexed()` - `num_unindexed_rows` ← `count_rows() - rows_indexed()`, matching how `describe_table_index_stats` computes it - `size_bytes` ← `total_size_bytes()` (stays `None` for legacy indices without file tracking) - `num_segments` ← `segments().len()` - `created_at` ← min `created_at` across segments, RFC3339-formatted (stays `None` for legacy indices) - `index_version` ← first segment's `index_version` - `index_details` ← `details().ok()` `dir.rs` is the only backend in this repo that constructs `IndexContent`; the REST adapter lives in lance-namespace. `test_list_table_indices` is extended to assert the new fields for both a scalar (`BTree`) and an IVF_FLAT vector index. Bumps `lance-namespace-reqwest-client` to 0.8.4, which also adds a `branch` field to `GetTableTagVersionResponse`; `get_table_tag_version` now reads full `TagContents` via `tags().get()` to populate it. 
Closes #7101 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-authored-by: Claude Opus 4.8 --- Cargo.toml | 2 +- rust/lance-namespace-impls/src/dir.rs | 97 +++++++++++++++++++++++++-- 2 files changed, 92 insertions(+), 7 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 8cd3b02de7d..1996e2a2d57 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -72,7 +72,7 @@ lance-linalg = { version = "=8.0.0-beta.11", path = "./rust/lance-linalg" } lance-namespace = { version = "=8.0.0-beta.11", path = "./rust/lance-namespace" } lance-namespace-impls = { version = "=8.0.0-beta.11", path = "./rust/lance-namespace-impls" } lance-namespace-datafusion = { version = "=7.0.0-beta.9", path = "./rust/lance-namespace-datafusion" } -lance-namespace-reqwest-client = "0.8.2" +lance-namespace-reqwest-client = "0.8.4" lance-select = { version = "=8.0.0-beta.11", path = "./rust/lance-select" } lance-tokenizer = { version = "=8.0.0-beta.11", path = "./rust/lance-tokenizer" } lance-table = { version = "=8.0.0-beta.11", path = "./rust/lance-table" } diff --git a/rust/lance-namespace-impls/src/dir.rs b/rust/lance-namespace-impls/src/dir.rs index 053b339a4f3..8859e4bc237 100644 --- a/rust/lance-namespace-impls/src/dir.rs +++ b/rust/lance-namespace-impls/src/dir.rs @@ -3391,6 +3391,11 @@ impl LanceNamespace for DirectoryNamespace { let dataset = self .load_dataset(&table_uri, request.version, "list_table_indices") .await?; + let total_rows = dataset.count_rows(None).await.map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to count rows for table '{}': {:?}", table_uri, e), + }) + })? 
as u64; let mut indices = dataset .describe_indices(None) .await @@ -3433,13 +3438,35 @@ impl LanceNamespace for DirectoryNamespace { }) .collect::>>()?; - Ok(IndexContent { + let segments = description.segments(); + let created_at = segments + .iter() + .filter_map(|segment| segment.created_at) + .min() + .map(|ts| ts.to_rfc3339()); + + // `..Default::default()` keeps this tolerant of additive reqwest + // client model changes (see #7212). + #[allow(clippy::needless_update)] + let content = IndexContent { index_name: description.name().to_string(), index_uuid: description.metadata()[0].uuid.to_string(), columns, status: "SUCCEEDED".to_string(), + index_type: Some(description.index_type().to_string()), + type_url: Some(description.type_url().to_string()), + num_indexed_rows: Some(description.rows_indexed() as i64), + num_unindexed_rows: Some( + total_rows.saturating_sub(description.rows_indexed()) as i64, + ), + size_bytes: description.total_size_bytes().map(|size| size as i64), + num_segments: Some(segments.len() as i32), + created_at, + index_version: segments.first().map(|segment| segment.index_version), + index_details: description.details().ok(), ..Default::default() - }) + }; + Ok(content) }) .collect::>>()?; @@ -4345,13 +4372,16 @@ impl LanceNamespace for DirectoryNamespace { .load_dataset(&table_uri, None, "get_table_tag_version") .await?; - let version = dataset + let contents = dataset .tags() - .get_version(&request.tag) + .get(&request.tag) .await .map_err(|e| Self::map_tag_error(e, &request.tag, &table_uri))?; - Ok(GetTableTagVersionResponse::new(version as i64)) + Ok(GetTableTagVersionResponse { + version: contents.version as i64, + branch: contents.branch, + }) } async fn create_table_tag( @@ -6748,7 +6778,7 @@ mod tests { #[tokio::test] async fn test_list_table_indices() { - use lance_namespace::models::ListTableIndicesRequest; + use lance_namespace::models::{CreateTableIndexRequest, ListTableIndicesRequest}; let (namespace, _temp_dir) = 
create_test_namespace().await; create_scalar_table(&namespace, "users").await; @@ -6777,6 +6807,22 @@ mod tests { assert_eq!(users_id_idx.columns, vec!["id"]); assert_eq!(users_id_idx.status, "SUCCEEDED"); + // Enriched fields populated from the index metadata for a scalar index. + assert_eq!(users_id_idx.index_type.as_deref(), Some("BTree")); + assert!( + users_id_idx + .type_url + .as_deref() + .is_some_and(|s| !s.is_empty()) + ); + assert_eq!(users_id_idx.num_indexed_rows, Some(3)); + assert_eq!(users_id_idx.num_unindexed_rows, Some(0)); + assert_eq!(users_id_idx.num_segments, Some(1)); + assert!(users_id_idx.size_bytes.is_some_and(|size| size > 0)); + assert!(users_id_idx.created_at.is_some()); + assert!(users_id_idx.index_version.is_some()); + assert!(users_id_idx.index_details.is_some()); + let dataset = open_dataset(&namespace, "users").await; let expected_transaction_id = dataset .read_transaction() @@ -6820,6 +6866,44 @@ mod tests { assert_eq!(second_page.indexes.len(), 1); assert_eq!(second_page.indexes[0].index_name, "users_id_idx"); assert!(second_page.page_token.is_none()); + + // A vector index exercises a different type_url, index_type, and details payload. 
+ create_vector_table(&namespace, "vectors").await; + let mut create_index_request = + CreateTableIndexRequest::new("vector".to_string(), "IVF_FLAT".to_string()); + create_index_request.id = Some(vec!["vectors".to_string()]); + create_index_request.name = Some("vector_idx".to_string()); + create_index_request.distance_type = Some("l2".to_string()); + namespace + .create_table_index(create_index_request) + .await + .unwrap(); + + let vector_response = namespace + .list_table_indices(ListTableIndicesRequest { + id: Some(vec!["vectors".to_string()]), + ..Default::default() + }) + .await + .unwrap(); + + assert_eq!(vector_response.indexes.len(), 1); + let vector_idx = &vector_response.indexes[0]; + assert_eq!(vector_idx.index_name, "vector_idx"); + assert_eq!(vector_idx.columns, vec!["vector"]); + assert_eq!(vector_idx.index_type.as_deref(), Some("IVF_FLAT")); + assert!( + vector_idx + .type_url + .as_deref() + .is_some_and(|s| !s.is_empty()) + ); + assert!(vector_idx.num_indexed_rows.is_some()); + assert!(vector_idx.num_unindexed_rows.is_some()); + assert_eq!(vector_idx.num_segments, Some(1)); + assert!(vector_idx.created_at.is_some()); + assert!(vector_idx.index_version.is_some()); + assert!(vector_idx.index_details.is_some()); } #[tokio::test] @@ -12276,6 +12360,7 @@ mod tests { get_req.id = Some(table_id); let resp = namespace.get_table_tag_version(get_req).await.unwrap(); assert_eq!(resp.version, 2); + assert_eq!(resp.branch, None); } #[tokio::test] From 89a6dae445592699f2112d0feb606718e78a4a15 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 10 Jun 2026 18:49:18 -0700 Subject: [PATCH 082/177] perf(index): query BTree lookup batch directly (#7186) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #6802. `BTreeIndex` held both the `page_lookup.lance` batch and a parallel `BTreeLookup` (a `BTreeMap<OrderableScalarValue, Vec<PageRecord>>` plus null-page lists), duplicating every min/max value as owned `ScalarValue`s alongside the Arrow buffers. 
This PR rewrites `BTreeLookup` to wrap the lookup batch as the single source of truth: - Range searches binary-search the sorted `min` column via `arrow_ord::make_comparator`, then scan forward filtering by `max` and classify `Matches::Some`/`All` — same big-O as before, with packed Arrow buffers instead of scattered tree nodes. Only the small all-null / partial-null page index lists are precomputed. - Equality / `IN` lookups go through a shared `candidate_pages_for_values` that compares query values against the page columns with a native, inlined comparator (no boxed `DynComparator` vtable call per comparison). Dispatch is on the **physical storage type**: logical types backed by the same native are reinterpreted to one path — zero-copy when already that type, otherwise an O(1) `ArrayData` relabel with no value copy — so every `Date32`/`Time32`/`Decimal32`/`IntervalYearMonth` reuses the `i32` path and every `Date64`/`Time64`/`Timestamp`/`Duration`/`Decimal64` reuses the `i64` path, rather than each generating its own. Byte-like columns (`Utf8`/`LargeUtf8`/`Binary`/`LargeBinary`/`FixedSizeBinary`) compare lexicographically via `ArrayAccessor`; intervals with struct natives and booleans fall back to `make_comparator`. Comparators are built once per query and reused across all values. This keeps the scan to ~19 monomorphizations instead of one per logical type. - `BTreeIndex` no longer stores a separate `lookup_batch`; `statistics()` reads bounds from the batch and cache serialization clones the batch out of `page_lookup`. - Float ordering uses `total_cmp` (`ArrowNativeTypeOp::compare` on the native path, `make_comparator` on the fallback), matching the previous `OrderableScalarValue` ordering — so the NaN caveat in the issue is a non-issue. The first commit reverts #7161, which had taken the opposite approach (caching the parsed tree and regenerating the batch on serialize); that machinery is obsolete once the batch is the source of truth. 
## Testing - `cargo test -p lance-index --lib` (all pass, incl. NaN ordering, null handling, range/fragment consistency) - `test_btree_lookup_pages_between` covers duplicate `min`s, a null-`min` straddling page, Some/All classification, and empty/inverted ranges. - `test_btree_lookup_pages_eq_bytes` covers the native byte path for `Binary` and `FixedSizeBinary` (e.g. UUID columns). - `test_btree_lookup_pages_eq_temporal` covers the physical-type reinterpret (`Date32`→`i32`, `Timestamp`→`i64`). ## Benchmarks `cargo bench -p lance-index --bench btree` (the existing suite: numeric + string, high/low cardinality, equality / range / `IN`, cached + uncached). Compared against current `main` (which includes #7161) using criterion's baseline significance test — `--load-baseline --baseline main` — so the numbers below are criterion's own bootstrapped change estimate `[lower, upper]` around the median, with its p-value. Config: `sample_size = 10`, `measurement_time = 10s`, default 2% noise threshold. Run on a single dev machine (macOS arm64), so absolute timings are machine-specific; the verdicts are what matter. Equality/`IN` are from the current HEAD; `range_*` exercises `pages_between`, which the equality/IN dispatch work did not touch, so those numbers reflect the same code. 
### Wins — low-cardinality and load/deserialize-bound cases No longer rebuilding a `BTreeMap` on load, plus the native comparator inlining: | case | change | p | verdict | |---|---|---|---| | equality/int_low_card/no_cache | −27.5% [−30.7, −24.3] | 0.00 | improved | | equality/string_low_card/no_cache | −26.6% [−29.3, −24.3] | 0.00 | improved | | range_few/int_low_card/no_cache | −23.7% [−26.5, −21.4] | 0.00 | improved | | range_few/string_low_card/no_cache | −23.5% [−24.7, −21.9] | 0.00 | improved | | equality/string_low_card/cached | −20.3% [−22.8, −18.5] | 0.00 | improved | | range_few/string_low_card/cached | −18.0% [−19.4, −16.3] | 0.00 | improved | | equality/int_low_card/cached | −6.64% [−7.90, −5.61] | 0.00 | improved | | range_few/int_low_card/cached | −6.34% [−8.02, −4.49] | 0.00 | improved | ### Hot-path point / `IN` lookups (native physical-type dispatch) An earlier iteration that queried the batch through the boxed `make_comparator` (one vtable call per comparison) regressed warm-cache high-cardinality lookups by up to +44% on a 30-value `IN`. The native comparators remove that. 
**Vs `main`, all 32 equality+IN benchmarks are improved or at parity — zero regressions.** High-cardinality warm-cache equality is back to parity; warm-cache `IN` is improved: | case | change vs main | p | verdict | |---|---|---|---| | equality/int_unique/cached | parity | — | no change | | equality/string_unique/cached | parity | — | no change | | in_30/int_unique/cached | −9.67% [−10.7, −9.0] | 0.00 | improved | | in_20/int_unique/cached | −3.81% [−4.11, −3.57] | 0.00 | improved | | in_10/int_unique/cached | −2.80% [−4.84, −1.27] | 0.00 | improved | | in_20/string_low_card/cached | −5.40% [−8.39, −2.96] | 0.00 | improved | | in_10/string_low_card/cached | −3.05% [−4.36, −1.94] | 0.00 | improved | | in_30/string_low_card/cached | −2.06% [−3.15, −1.23] | 0.00 | improved | ### Noise calibration To measure the harness noise floor, comparing two runs of **identical code** (same binary behavior) still produces "significant" verdicts: | case (identical code) | change | p | verdict | |---|---|---|---| | in_10/string_unique/cached | −4.29% [−7.41, −1.86] | 0.01 | "improved" | | equality/int_unique/cached | −2.87% [−5.60, −0.68] | 0.02 | "improved" | | equality/int_unique/no_cache | −3.14% [−4.42, −2.02] | 0.00 | "improved" | The code did not change, so these p < 0.05 verdicts are run-to-run variance. Consistent with that, `in_10/int_unique/cached` has read anywhere from −4.2% to +9.3% across runs. At `sample_size = 10` on warm multi-µs benchmarks, swings of ±5% earn p < 0.05 from variance alone, so single-digit-percent deltas on the warm high-cardinality cases are not reliable signal. ### Residual cost `range_*/unique/cached` shows small regressions — e.g. `range_few/string_unique/cached +2.7%`, `range_few/int_unique/cached +3.8%`. `pages_between` (the range path) still uses `make_comparator`; the native physical-type dispatch was added only to the equality/`IN` path. 
These sit at the edge of the calibrated noise band but are plausibly a small real cost — the deliberate tradeoff for not duplicating every min/max as an owned `ScalarValue` in memory, and a candidate for extending the physical-type dispatch to ranges in a follow-up. All other cases report "no change in performance detected." --------- Co-authored-by: Claude Opus 4.8 (1M context) --- rust/lance-index/src/scalar/btree.rs | 1765 ++++++++++++++++++-------- 1 file changed, 1245 insertions(+), 520 deletions(-) diff --git a/rust/lance-index/src/scalar/btree.rs b/rust/lance-index/src/scalar/btree.rs index 0d278d2d1e2..6128248308e 100644 --- a/rust/lance-index/src/scalar/btree.rs +++ b/rust/lance-index/src/scalar/btree.rs @@ -4,7 +4,7 @@ use std::{ any::Any, cmp::Ordering, - collections::{BTreeMap, BinaryHeap, HashMap, HashSet}, + collections::{HashMap, HashSet}, fmt::{Debug, Display}, ops::Bound, sync::Arc, @@ -28,8 +28,17 @@ use crate::{ use crate::{metrics::NoOpMetricsCollector, scalar::registry::TrainingCriteria}; use crate::{pbold, scalar::btree::flat::FlatIndex}; use arrow_arith::numeric::add; -use arrow_array::{Array, RecordBatch, UInt32Array, new_empty_array}; -use arrow_schema::{DataType, Field, Schema, SortOptions}; +use arrow_array::{ + Array, ArrayAccessor, ArrowNativeTypeOp, PrimitiveArray, RecordBatch, UInt32Array, + cast::AsArray, + new_empty_array, + types::{ + ArrowPrimitiveType, Decimal128Type, Decimal256Type, Float16Type, Float32Type, Float64Type, + Int8Type, Int16Type, Int32Type, Int64Type, UInt8Type, UInt16Type, UInt32Type, UInt64Type, + }, +}; +use arrow_ord::ord::make_comparator; +use arrow_schema::{DataType, Field, IntervalUnit, Schema, SortOptions}; use async_trait::async_trait; use datafusion::physical_plan::{ ExecutionPlan, SendableRecordBatchStream, @@ -594,46 +603,114 @@ impl Ord for OrderableScalarValue { } } -#[derive(Debug, DeepSizeOf, PartialEq, Eq)] -struct PageRecord { - max: OrderableScalarValue, - page_number: u32, +/// Returns the first 
index `i` in `[lo, hi)` for which `pred(i)` is `false`. +/// +/// `pred` must be `true` for a (possibly empty) prefix of the range and `false` +/// for the rest, i.e. the range is partitioned by `pred`. +fn partition_point(lo: usize, hi: usize, mut pred: impl FnMut(usize) -> bool) -> usize { + let mut lo = lo; + let mut hi = hi; + while lo < hi { + let mid = lo + (hi - lo) / 2; + if pred(mid) { + lo = mid + 1; + } else { + hi = mid; + } + } + lo } -trait BTreeMapExt { - fn largest_node_less(&self, key: &K) -> Option<(&K, &V)>; +/// Builds a comparator over two array accessors of the same `Ord` item type, +/// matching arrow's NULLs-first ascending order (`null < non-null`, `null == null`). +/// +/// Unlike [`make_comparator`], the returned closure is generic (not boxed), so the +/// element comparison inlines into the scan instead of dispatching through a vtable +/// on every call. +fn accessor_cmp<'a, T, L, R>(left: L, right: R) -> impl Fn(usize, usize) -> Ordering + 'a +where + T: Ord, + L: ArrayAccessor + 'a, + R: ArrayAccessor + 'a, +{ + move |i, j| match (left.is_null(i), right.is_null(j)) { + (true, true) => Ordering::Equal, + (true, false) => Ordering::Less, + (false, true) => Ordering::Greater, + (false, false) => left.value(i).cmp(&right.value(j)), + } } -impl BTreeMapExt for BTreeMap { - fn largest_node_less(&self, key: &K) -> Option<(&K, &V)> { - self.range((Bound::Unbounded, Bound::Excluded(key))) - .next_back() +/// Views `arr` as `PrimitiveArray` for comparison. Zero-copy (shared buffers) +/// when `arr` already has type `K`; otherwise — a logical type whose physical +/// storage is `K::Native`, e.g. `Date32`/`Time32` over `i32` or `Timestamp`/ +/// `Duration` over `i64` — the array data is relabeled to `K` without copying the +/// values, so all such logical types share one comparison path. 
+fn reinterpret_primitive(arr: &dyn Array) -> Result> { + if let Some(arr) = arr.as_primitive_opt::() { + return Ok(arr.clone()); } + let data = arr + .to_data() + .into_builder() + .data_type(K::DATA_TYPE) + .build() + .map_err(|e| { + Error::internal(format!( + "failed to reinterpret {} as {}: {e}", + arr.data_type(), + K::DATA_TYPE + )) + })?; + Ok(PrimitiveArray::::from(data)) } -/// An in-memory structure that can quickly satisfy scalar queries using a btree of ScalarValue -#[derive(Debug, DeepSizeOf, PartialEq, Eq)] -pub struct BTreeLookup { - tree: BTreeMap>, - /// Pages where the value may be null (does not include all_null_pages), - /// keyed by page number with the exact null count in that page. - null_pages: HashMap, - /// Pages that are entirely null, keyed by page number with the exact null - /// count in that page. - all_null_pages: HashMap, +/// Like [`accessor_cmp`] but for primitive columns, comparing native values with +/// [`ArrowNativeTypeOp::compare`] (total order, so floats match arrow's NaN-last +/// `make_comparator` ordering). +fn primitive_cmp<'a, T>( + left: &'a PrimitiveArray, + right: &'a PrimitiveArray, +) -> impl Fn(usize, usize) -> Ordering + 'a +where + T: ArrowPrimitiveType, +{ + move |i, j| match (left.is_null(i), right.is_null(j)) { + (true, true) => Ordering::Equal, + (true, false) => Ordering::Less, + (false, true) => Ordering::Greater, + (false, false) => left.value(i).compare(right.value(j)), + } } -impl BTreeLookup { - fn empty() -> Self { - Self { - tree: BTreeMap::new(), - null_pages: HashMap::new(), - all_null_pages: HashMap::new(), - } - } +/// Satisfies scalar queries by searching the `page_lookup.lance` batch directly. +/// +/// The batch holds one row per page with columns `min | max | null_count | page_idx`, +/// sorted ascending by `min` with NULLs first (the order the index is trained in). 
+/// Both query paths binary-search the sorted `min` column for a starting row and +/// scan forward filtering by `max`: +/// +/// - Equality / `IN` (`candidate_pages_for_values`) dispatch on the query's +/// *physical storage type* to a monomorphized, inlined comparator: numerics go +/// through `scan_native` (logical types sharing a native — e.g. `Date32` and +/// `Int32` — fold to one path), byte-likes through `scan_accessor`. Only types +/// without a native fast path (struct-backed intervals, booleans) fall back to the +/// boxed [`make_comparator`] via `scan_fallback`. +/// - Range searches (`pages_between`) currently use [`make_comparator`] directly. +#[derive(Debug, PartialEq, DeepSizeOf)] +pub struct BTreeLookup { + /// One row per page (`min | max | null_count | page_idx`), sorted by `min`. + batch: RecordBatch, + /// Pages with at least one null value (does not include `all_null_pages`). + null_pages: Vec, + /// Pages that are entirely null. + all_null_pages: Vec, + /// Index of the first row whose `max` is non-null. Entirely-null pages sort to + /// the front (NULLs first) and are skipped when searching value ranges. + search_start: usize, } -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] enum Matches { Some(u32), All(u32), @@ -649,186 +726,461 @@ impl Matches { } impl BTreeLookup { - fn new( - tree: BTreeMap>, - null_pages: HashMap, - all_null_pages: HashMap, - ) -> Self { - Self { - tree, + /// Build a lookup over the `page_lookup.lance` batch. The batch is retained as + /// the source of truth; only the small null-page index lists are precomputed. 
+ fn try_new(batch: RecordBatch) -> Result { + let mut null_pages = Vec::new(); + let mut all_null_pages = Vec::new(); + let mut search_start = batch.num_rows(); + + if batch.num_rows() > 0 { + let maxs = batch.column(1); + let null_counts = batch + .column(2) + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::internal("BTree lookup null_count column must be UInt32"))?; + let page_numbers = batch + .column(3) + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::internal("BTree lookup page_idx column must be UInt32"))?; + + for idx in 0..batch.num_rows() { + let page_number = page_numbers.values()[idx]; + // An entirely-null page has a null `max`; it is never searched by value. + if maxs.is_null(idx) { + all_null_pages.push(page_number); + continue; + } + if search_start == batch.num_rows() { + search_start = idx; + } + if null_counts.values()[idx] > 0 { + null_pages.push(page_number); + } + } + } else { + search_start = 0; + } + + Ok(Self { + batch, null_pages, all_null_pages, - } + search_start, + }) + } + + fn page_numbers(&self) -> Result<&UInt32Array> { + self.batch + .column(3) + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::internal("BTree lookup page_idx column must be UInt32")) } // All pages that could have a value equal to val - fn pages_eq(&self, query: &OrderableScalarValue) -> Vec { + fn pages_eq(&self, query: &OrderableScalarValue) -> Result> { if query.0.is_null() { - self.pages_null() + Ok(self.pages_null()) } else { - self.pages_between((Bound::Included(query), Bound::Excluded(query))) + let query_arr = query.0.to_array_of_size(1)?; + let pages = self.candidate_pages_for_values(query_arr.as_ref())?; + Ok(pages.into_iter().map(Matches::Some).collect()) } } // All pages that could have a value equal to one of the values - fn pages_in(&self, values: impl IntoIterator) -> Vec { - // TODO: Right now we convert all Matches::All into Matches::Some. We could refine this. - // It would improve performance on low cardinality data. 
- let page_lists = values - .into_iter() - .map(|val| { - self.pages_eq(&val) - .into_iter() - .map(|matches| matches.page_id()) - }) - .collect::>(); - let total_size = page_lists.iter().map(|set| set.len()).sum(); - let mut heap = BinaryHeap::with_capacity(total_size); - for page_list in page_lists { - heap.extend(page_list); + fn pages_in( + &self, + values: impl IntoIterator, + ) -> Result> { + // Equality lookups never produce a full-page (`Matches::All`) match because a + // single value cannot cover an entire page's range, so every candidate is + // `Matches::Some`. Refining this for low-cardinality data is the TODO in + // `pages_between`. + let values = values.into_iter(); + let mut has_null = false; + let mut non_null = Vec::with_capacity(values.size_hint().0); + for val in values { + if val.0.is_null() { + has_null = true; + } else { + non_null.push(val.0); + } } - let mut all_pages = heap.into_sorted_vec(); + + // Build a single array holding every queried value so the comparators are + // constructed once and reused across all of them, rather than per value. + let mut all_pages = if non_null.is_empty() { + Vec::new() + } else { + let query_arr = ScalarValue::iter_to_array(non_null)?; + self.candidate_pages_for_values(query_arr.as_ref())? + }; + if has_null { + all_pages.extend(self.pages_null().into_iter().map(|m| m.page_id())); + } + all_pages.sort_unstable(); all_pages.dedup(); - all_pages.into_iter().map(Matches::Some).collect() + Ok(all_pages.into_iter().map(Matches::Some).collect()) + } + + /// Candidate page numbers (deduped, ascending) for an equality search against + /// every value in `query`. A page is a candidate when its `[min, max]` range + /// could contain the value, i.e. `min <= value <= max`. + /// + /// The comparators are built once over the whole `query` array and reused for + /// each value, so an N-value `IN` costs three comparator constructions instead + /// of three per value. 
+ fn candidate_pages_for_values(&self, query: &dyn Array) -> Result> { + let num_rows = self.batch.num_rows(); + if self.search_start >= num_rows || query.is_empty() { + return Ok(vec![]); + } + + let mins = self.batch.column(0).as_ref(); + let maxs = self.batch.column(1).as_ref(); + let page_ids = self.page_numbers()?.values(); + + // Compare against the page columns with a native, monomorphized comparator + // that inlines, rather than the boxed `DynComparator` from `make_comparator` + // (one vtable call per comparison). Logical types that share a physical + // storage type route to one path via a zero-copy reinterpret, so e.g. every + // date/time/timestamp/duration type reuses the `i32`/`i64` path instead of + // generating its own. Types with no native path (intervals with struct + // natives, booleans, ...) take the `make_comparator` fallback. The query + // array always matches the column type, so its type selects the branch. + use DataType::*; + match query.data_type() { + Int8 => self.scan_native::(mins, maxs, query, page_ids), + Int16 => self.scan_native::(mins, maxs, query, page_ids), + // i32-backed: Int32, Date32, Time32, Decimal32, year-month intervals. + Int32 | Date32 | Time32(_) | Decimal32(_, _) | Interval(IntervalUnit::YearMonth) => { + self.scan_native::(mins, maxs, query, page_ids) + } + // i64-backed: Int64, Date64, Time64, Timestamp, Duration, Decimal64. 
+ Int64 | Date64 | Time64(_) | Timestamp(_, _) | Duration(_) | Decimal64(_, _) => { + self.scan_native::(mins, maxs, query, page_ids) + } + UInt8 => self.scan_native::(mins, maxs, query, page_ids), + UInt16 => self.scan_native::(mins, maxs, query, page_ids), + UInt32 => self.scan_native::(mins, maxs, query, page_ids), + UInt64 => self.scan_native::(mins, maxs, query, page_ids), + Float16 => self.scan_native::(mins, maxs, query, page_ids), + Float32 => self.scan_native::(mins, maxs, query, page_ids), + Float64 => self.scan_native::(mins, maxs, query, page_ids), + Decimal128(_, _) => self.scan_native::(mins, maxs, query, page_ids), + Decimal256(_, _) => self.scan_native::(mins, maxs, query, page_ids), + Utf8 => Ok(self.scan_accessor( + mins.as_string::(), + maxs.as_string::(), + query.as_string::(), + page_ids, + )), + LargeUtf8 => Ok(self.scan_accessor( + mins.as_string::(), + maxs.as_string::(), + query.as_string::(), + page_ids, + )), + Binary => Ok(self.scan_accessor( + mins.as_binary::(), + maxs.as_binary::(), + query.as_binary::(), + page_ids, + )), + LargeBinary => Ok(self.scan_accessor( + mins.as_binary::(), + maxs.as_binary::(), + query.as_binary::(), + page_ids, + )), + FixedSizeBinary(_) => Ok(self.scan_accessor( + mins.as_fixed_size_binary(), + maxs.as_fixed_size_binary(), + query.as_fixed_size_binary(), + page_ids, + )), + _ => self.scan_fallback(mins, maxs, query, page_ids), + } + } + + /// Native-comparator equality scan for a primitive physical type `K`. The page + /// columns and `query` are reinterpreted to `PrimitiveArray` (zero-copy when + /// already that type) and compared with [`primitive_cmp`]. 
+ fn scan_native( + &self, + mins: &dyn Array, + maxs: &dyn Array, + query: &dyn Array, + page_ids: &[u32], + ) -> Result> { + let mins = reinterpret_primitive::(mins)?; + let maxs = reinterpret_primitive::(maxs)?; + let query = reinterpret_primitive::(query)?; + Ok(self.scan_equality_pages( + query.len(), + page_ids, + |idx| maxs.is_null(idx), + primitive_cmp(&mins, &query), + primitive_cmp(&maxs, &query), + primitive_cmp(&mins, &mins), + )) + } + + /// Native-comparator equality scan for byte-like columns (`Utf8`/`Binary`/ + /// `FixedSizeBinary` and their large variants), compared lexicographically via + /// [`accessor_cmp`]. + fn scan_accessor(&self, mins: A, maxs: A, query: A, page_ids: &[u32]) -> Vec + where + T: Ord, + A: ArrayAccessor + Copy, + { + self.scan_equality_pages( + query.len(), + page_ids, + |idx| maxs.is_null(idx), + accessor_cmp(mins, query), + accessor_cmp(maxs, query), + accessor_cmp(mins, mins), + ) + } + + /// Fallback equality scan for types without a native path (intervals with struct + /// natives, booleans, ...), using arrow's boxed `make_comparator`. + fn scan_fallback( + &self, + mins: &dyn Array, + maxs: &dyn Array, + query: &dyn Array, + page_ids: &[u32], + ) -> Result> { + // The batch is sorted ascending by `min` with NULLs first; compare the query + // values the same way so the binary searches stay consistent. + let opts = SortOptions { + descending: false, + nulls_first: true, + }; + let cmp_min = make_comparator(mins, query, opts)?; + let cmp_max = make_comparator(maxs, query, opts)?; + let cmp_min_min = make_comparator(mins, mins, opts)?; + Ok(self.scan_equality_pages( + query.len(), + page_ids, + |idx| maxs.is_null(idx), + cmp_min, + cmp_max, + cmp_min_min, + )) + } + + /// Binary-search + forward-scan the page batch for equality candidates. + /// + /// Monomorphized over the comparator closures so a typed-native comparator + /// inlines (no per-call vtable dispatch). 
The closures encode NULLs-first, + /// ascending order: + /// * `max_is_null(i)` — whether page `i`'s `max` is null (an all-null page) + /// * `cmp_min(i, j)` — page `i`'s `min` vs query value `j` + /// * `cmp_max(i, j)` — page `i`'s `max` vs query value `j` + /// * `cmp_min_min(i, anchor)` — two page `min`s, to expand left onto a straddle + fn scan_equality_pages( + &self, + num_query: usize, + page_ids: &[u32], + max_is_null: impl Fn(usize) -> bool, + cmp_min: impl Fn(usize, usize) -> Ordering, + cmp_max: impl Fn(usize, usize) -> Ordering, + cmp_min_min: impl Fn(usize, usize) -> Ordering, + ) -> Vec { + let num_rows = self.batch.num_rows(); + // High-cardinality lookups hit ~one page per value; presize to avoid the + // element-by-element `RawVec` growth that profiling flagged. + let mut pages = Vec::with_capacity(num_query); + for j in 0..num_query { + // Start row: peek a little to the left of the value. A query for 7 must + // still reach a page like [5, 10], so we include every page whose `min` + // equals the largest `min` strictly less than the value. + let p = partition_point(0, num_rows, |i| cmp_min(i, j) == Ordering::Less); + let start = if p == 0 { + self.search_start + } else { + let anchor = p - 1; + partition_point(0, p, |i| cmp_min_min(i, anchor) == Ordering::Less) + } + .max(self.search_start); + + // End row: pages whose `min` exceeds the value cannot match. + let end = partition_point(start, num_rows, |i| cmp_min(i, j) != Ordering::Greater); + + // The window splits at `p` (first row with `min >= value`): + // * `[start, p)` — the peek-left/straddle region (`min < value`). A page + // here matches only if its `max` reaches the value, so it needs the + // filter, and it may include a null-`min`/null-`max` straddle page. + // * `[p, end)` — rows with `min == value`. 
These always match (`max >= + // min == value`) and can't have a null `max` (all-null pages sort to + // the front, before `search_start <= start`), so we copy them in one + // slice instead of pushing per row. + let bulk_start = p.max(start); + for (offset, &page_id) in page_ids[start..bulk_start].iter().enumerate() { + let idx = start + offset; + // All-null pages are only matched by IS NULL queries. + if max_is_null(idx) { + continue; + } + // Candidate when the page's `max` reaches the value (`max >= value`). + if cmp_max(idx, j) != Ordering::Less { + pages.push(page_id); + } + } + pages.extend_from_slice(&page_ids[bulk_start..end]); + } + + pages.sort_unstable(); + pages.dedup(); + pages } // All pages that could have a value in the range fn pages_between( &self, range: (Bound<&OrderableScalarValue>, Bound<&OrderableScalarValue>), - ) -> Vec { - // We need to grab a little bit left of the given range because the query might be 7 - // and the first page might be something like 5-10. - let lower_bound = match range.0 { - Bound::Unbounded => Bound::Unbounded, - // It doesn't matter if the bound is exclusive or inclusive. We are going to grab - // the first node whose min is strictly less than the given bound. Then we grab - // all nodes greater than or equal to that - // - // We have to peek a bit to the left because we might have something like a lower - // bound of 7 and there is a page [5-10] we want to search for. - Bound::Included(lower) => self - .tree - .largest_node_less(lower) - .map(|val| Bound::Included(val.0)) - .unwrap_or(Bound::Unbounded), - Bound::Excluded(lower) => self - .tree - .largest_node_less(lower) - .map(|val| Bound::Included(val.0)) - .unwrap_or(Bound::Unbounded), + ) -> Result> { + let num_rows = self.batch.num_rows(); + // No searchable (non-all-null) pages. 
+ if self.search_start >= num_rows { + return Ok(vec![]); + } + + let mins = self.batch.column(0).as_ref(); + let maxs = self.batch.column(1).as_ref(); + let page_numbers = self.page_numbers()?; + + // The batch is sorted ascending by `min` with NULLs first; compare bounds + // the same way so the binary searches and the null `min` of a straddling + // page are handled consistently. + let opts = SortOptions { + descending: false, + nulls_first: true, + }; + // Bounds become 1-row arrays of the column type so arrow's type-dispatched + // comparator can compare them against the `min`/`max` columns. + let lower_arr = match range.0 { + Bound::Unbounded => None, + Bound::Included(v) | Bound::Excluded(v) => Some(v.0.to_array_of_size(1)?), }; - let upper_bound = match range.1 { - Bound::Unbounded => Bound::Unbounded, - Bound::Included(upper) => Bound::Included(upper), - // Even if the upper bound is excluded we need to include it on an [x, x) query. This is because the - // query might be [x, x). Our lower bound might find some [a-x] bucket and we still - // want to include any [x, z] bucket. - // - // We could be slightly more accurate here and only include the upper bound if the lower bound - // is defined, inclusive, and equal to the upper bound. However, let's keep it simple for now. This - // should only affect the probably rare case that our query is a true range query and the value - // matches an upper bound. This will all be moot if/when we merge pages. 
- Bound::Excluded(upper) => Bound::Included(upper), + let upper_arr = match range.1 { + Bound::Unbounded => None, + Bound::Included(v) | Bound::Excluded(v) => Some(v.0.to_array_of_size(1)?), }; - match (lower_bound, upper_bound) { - (Bound::Excluded(lower), Bound::Excluded(upper)) - | (Bound::Excluded(lower), Bound::Included(upper)) - | (Bound::Included(lower), Bound::Excluded(upper)) => { - // It's not really clear what (Included(5), Excluded(5)) would mean so we - // interpret it as an empty range which matches rust's BTreeMap behavior - if lower >= upper { - return vec![]; + // Start row: peek a little to the left of the lower bound. A query for 7 + // must still reach a page like [5, 10], so we include every page whose + // `min` equals the largest `min` strictly less than the lower bound. + let start = match &lower_arr { + None => self.search_start, + Some(lower) => { + let cmp = make_comparator(mins, lower.as_ref(), opts)?; + // first row with min >= lower + let p = partition_point(0, num_rows, |i| cmp(i, 0) == Ordering::Less); + if p == 0 { + self.search_start + } else { + // first row sharing the straddling page's `min` + let straddle = mins.slice(p - 1, 1); + let cmp = make_comparator(mins, straddle.as_ref(), opts)?; + partition_point(0, p, |i| cmp(i, 0) == Ordering::Less) } } - (Bound::Included(lower), Bound::Included(upper)) => { - if lower > upper { - return vec![]; - } + } + .max(self.search_start); + + // End row: pages whose `min` exceeds the upper bound cannot match. The + // upper bound is treated as inclusive even when the query bound is + // exclusive, so an [x, x) query still reaches a page whose `min` == x. + let end = match &upper_arr { + None => num_rows, + Some(upper) => { + let cmp = make_comparator(mins, upper.as_ref(), opts)?; + partition_point(start, num_rows, |i| cmp(i, 0) != Ordering::Greater) } - _ => {} + }; + + if start >= end { + return Ok(vec![]); } + // Comparators reused across the candidate rows. 
+ let cmp_max_lower = lower_arr + .as_ref() + .map(|l| make_comparator(maxs, l.as_ref(), opts)) + .transpose()?; + let cmp_min_lower = lower_arr + .as_ref() + .map(|l| make_comparator(mins, l.as_ref(), opts)) + .transpose()?; + let cmp_max_upper = upper_arr + .as_ref() + .map(|u| make_comparator(maxs, u.as_ref(), opts)) + .transpose()?; + let mut matches = Vec::new(); + for idx in start..end { + // All-null pages are only matched by IS NULL queries. + if maxs.is_null(idx) { + continue; + } - for (min, page_records) in self.tree.range((lower_bound, upper_bound)) { - for page_record in page_records { - match lower_bound { - Bound::Unbounded => {} - Bound::Included(lower) => { - if page_record.max.cmp(lower) == Ordering::Less { - continue; - } - } - Bound::Excluded(lower) => { - if page_record.max.cmp(lower) != Ordering::Greater { - continue; - } - } - } - // At this point we know the page record matches at least some values. - // We should test to see if ALL values are a match. + // Candidate filter: the page's `max` reaches the lower bound. 
+ let lower_ok = match (range.0, &cmp_max_lower) { + (Bound::Unbounded, _) => true, + (Bound::Included(_), Some(cmp)) => cmp(idx, 0) != Ordering::Less, // max >= lower + (Bound::Excluded(_), Some(cmp)) => cmp(idx, 0) == Ordering::Greater, // max > lower + _ => unreachable!("lower bound and its comparator are constructed together"), + }; + if !lower_ok { + continue; + } - if min.0.is_null() || page_record.max.0.is_null() { - // If there are nulls then we just use Matches::Some - matches.push(Matches::Some(page_record.page_number)); - continue; - } + let page_number = page_numbers.values()[idx]; - match range.0 { - // range.0 < X therefore if the smallest value is not strictly greater than - // the lower bound we only have partial match - Bound::Excluded(lower) => { - if min.cmp(lower) != Ordering::Greater { - matches.push(Matches::Some(page_record.page_number)); - continue; - } - } - // range.0 <= X therefore if the smallest value is not greater than or equal - // to the lower bound we only have partial match - Bound::Included(lower) => { - if min.cmp(lower) == Ordering::Less { - matches.push(Matches::Some(page_record.page_number)); - continue; - } - } - Bound::Unbounded => {} - } - match range.1 { - // X < range.1 therefore if the largest value is not strictly less than - // the upper bound we only have partial match - Bound::Excluded(upper) => { - if page_record.max.cmp(upper) != Ordering::Less { - matches.push(Matches::Some(page_record.page_number)); - continue; - } - } - // X <= range.1 therefore if the largest value is not less than or equal to - // the upper bound we only have partial match - Bound::Included(upper) => { - if page_record.max.cmp(upper) == Ordering::Greater { - matches.push(Matches::Some(page_record.page_number)); - continue; - } - } - Bound::Unbounded => {} - } - // The min is greater than the lower bound and the max is less than the upper bound - // so we have a full match - matches.push(Matches::All(page_record.page_number)); + // A page with 
a null `min` straddles the NULL/non-NULL boundary, so it + // is only ever a partial match. + if mins.is_null(idx) { + matches.push(Matches::Some(page_number)); + continue; + } + + // Full match requires the page to sit entirely within the query range. + let lower_full = match (range.0, &cmp_min_lower) { + (Bound::Unbounded, _) => true, + (Bound::Included(_), Some(cmp)) => cmp(idx, 0) != Ordering::Less, // min >= lower + (Bound::Excluded(_), Some(cmp)) => cmp(idx, 0) == Ordering::Greater, // min > lower + _ => unreachable!("lower bound and its comparator are constructed together"), + }; + let upper_full = match (range.1, &cmp_max_upper) { + (Bound::Unbounded, _) => true, + (Bound::Included(_), Some(cmp)) => cmp(idx, 0) != Ordering::Greater, // max <= upper + (Bound::Excluded(_), Some(cmp)) => cmp(idx, 0) == Ordering::Less, // max < upper + _ => unreachable!("upper bound and its comparator are constructed together"), + }; + if lower_full && upper_full { + matches.push(Matches::All(page_number)); + } else { + matches.push(Matches::Some(page_number)); } } - matches + Ok(matches) } fn pages_null(&self) -> Vec { self.null_pages - .keys() + .iter() .copied() .map(Matches::Some) - .chain(self.all_null_pages.keys().copied().map(Matches::All)) + .chain(self.all_null_pages.iter().copied().map(Matches::All)) .collect() } } @@ -1008,148 +1360,25 @@ impl CacheKey for BTreePageKey { } } -fn parse_btree_lookup(data: &RecordBatch) -> Result<(Arc, DataType)> { - let data_type = data.column(0).data_type().clone(); - if data.num_rows() == 0 { - return Ok((Arc::new(BTreeLookup::empty()), data_type)); - } - - let mins = data.column(0); - let maxs = data.column(1); - let null_counts = data - .column(2) - .as_any() - .downcast_ref::() - .ok_or_else(|| Error::internal("BTree lookup null_count column must be UInt32"))?; - let page_numbers = data - .column(3) - .as_any() - .downcast_ref::() - .ok_or_else(|| Error::internal("BTree lookup page_idx column must be UInt32"))?; - - let mut map = 
BTreeMap::>::new(); - let mut null_pages = HashMap::::new(); - let mut all_null_pages = HashMap::::new(); - - for idx in 0..data.num_rows() { - let min = OrderableScalarValue(ScalarValue::try_from_array(&mins, idx)?); - let max = OrderableScalarValue(ScalarValue::try_from_array(&maxs, idx)?); - let null_count = null_counts.values()[idx]; - let page_number = page_numbers.values()[idx]; - - // If the page is entirely null don't even bother putting it in the tree. - if max.0.is_null() { - all_null_pages.insert(page_number, null_count); - continue; - } else { - map.entry(min) - .or_default() - .push(PageRecord { max, page_number }); - } - - if null_count > 0 { - null_pages.insert(page_number, null_count); - } - } - - let last_max = ScalarValue::try_from_array(&maxs, data.num_rows() - 1)?; - map.entry(OrderableScalarValue(last_max)).or_default(); - - Ok(( - Arc::new(BTreeLookup::new(map, null_pages, all_null_pages)), - data_type, - )) -} - -fn btree_lookup_as_batch(lookup: &BTreeLookup, data_type: &DataType) -> Result { - let mut mins = Vec::new(); - let mut maxs = Vec::new(); - let mut null_counts = Vec::new(); - let mut page_numbers = Vec::new(); - - // Keep all-null rows first so the regenerated lookup batch remains sorted - // with NULLs before non-NULL values. `parse_btree_lookup` adds a sentinel - // from the final row's max value, and that sentinel must not be NULL when - // the lookup also has non-null pages. - let null_value = ScalarValue::try_new_null(data_type)?; - let mut all_null_pages = lookup.all_null_pages.iter().collect::>(); - all_null_pages.sort_by_key(|(page_number, _)| **page_number); - for (page_number, null_count) in all_null_pages { - mins.push(null_value.clone()); - maxs.push(null_value.clone()); - null_counts.push(*null_count); - page_numbers.push(*page_number); - } - - // Preserve the exact null_count from the lookup batch. 
Query execution only - // needs `null_count > 0` to route IS NULL queries, but the lookup wire - // format stores exact counts and future costing / selectivity logic may use - // them. - for (min, page_records) in &lookup.tree { - for page_record in page_records { - mins.push(min.0.clone()); - maxs.push(page_record.max.0.clone()); - null_counts.push( - lookup - .null_pages - .get(&page_record.page_number) - .copied() - .unwrap_or(0), - ); - page_numbers.push(page_record.page_number); - } - } - - let min_array = if mins.is_empty() { - new_empty_array(data_type) - } else { - ScalarValue::iter_to_array(mins)? - }; - let max_array = if maxs.is_empty() { - new_empty_array(data_type) - } else { - ScalarValue::iter_to_array(maxs)? - }; - - let schema = Arc::new(Schema::new(vec![ - Field::new("min", data_type.clone(), true), - Field::new("max", data_type.clone(), true), - Field::new("null_count", DataType::UInt32, false), - Field::new("page_idx", DataType::UInt32, false), - ])); - Ok(RecordBatch::try_new( - schema, - vec![ - min_array, - max_array, - Arc::new(UInt32Array::from_iter_values(null_counts)), - Arc::new(UInt32Array::from_iter_values(page_numbers)), - ], - )?) -} - /// The serializable state of a [`BTreeIndex`]. /// -/// A `BTreeIndex` also holds non-serializable infrastructure such as an -/// `IndexStore`, a cache handle, and an optional fragment-reuse index. This -/// state keeps only the parsed lookup tree and small routing metadata needed to -/// rebuild the index without re-reading from blob storage. Once this state is -/// resident in memory, reconstructing a `BTreeIndex` does not reparse the -/// lookup. Restoring this state from a persistent cache still parses the -/// embedded IPC lookup payload into this parsed form. +/// A `BTreeIndex` holds non-serializable infrastructure (an `IndexStore`, a +/// cache handle, a fragment-reuse index). 
`BTreeIndexState` captures just the +/// data needed to rebuild it: the `page_lookup.lance` batch (from which +/// `BTreeIndex::try_from_serialized` reconstructs the in-memory lookup with +/// no IO) plus the page batch size and range-partition map. #[derive(Debug, Clone)] struct BTreeIndexState { - page_lookup: Arc, - data_type: DataType, + lookup_batch: RecordBatch, batch_size: u64, ranges_to_files: Option>>, } impl DeepSizeOf for BTreeIndexState { fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { - // `ranges_to_files` is tiny and `RangeInclusiveMap` is not `DeepSizeOf`. - // The parsed lookup tree is the resident memory we need to charge. - self.page_lookup.deep_size_of_children(context) + // `ranges_to_files` is tiny and `RangeInclusiveMap` is not `DeepSizeOf`; + // the lookup batch dominates, matching how `BTreeIndex` accounts for itself. + self.lookup_batch.deep_size_of_children(context) } } @@ -1160,28 +1389,28 @@ impl BTreeIndexState { index_cache: &LanceCache, frag_reuse_index: Option>, ) -> Result> { - let index = BTreeIndex::new( - self.page_lookup.clone(), + let index = BTreeIndex::try_from_serialized( + self.lookup_batch.clone(), store, - self.data_type.clone(), - WeakLanceCache::from(index_cache), + index_cache, self.batch_size, self.ranges_to_files.clone(), frag_reuse_index, - ); + )?; Ok(Arc::new(index) as Arc) } } impl CacheCodecImpl for BTreeIndexState { - /// Wire format (cache-internal, no stability guarantee): + /// Wire format (no stability guarantees yet — the cache is rebuilt from + /// source on any version mismatch): /// ```text /// u64 batch_size (LE) /// u8 has_ranges (0 = None, 1 = Some) /// if has_ranges: /// u32 entry_count (LE) /// per entry: u32 start | u32 end | u32 offset | u32 path_len | path bytes - /// lookup batch regenerated from the parsed BTreeLookup (Arrow IPC stream) + /// lookup batch (Arrow IPC stream) /// ``` fn serialize(&self, writer: &mut dyn std::io::Write) -> Result<()> { 
writer.write_all(&self.batch_size.to_le_bytes())?; @@ -1205,8 +1434,7 @@ impl CacheCodecImpl for BTreeIndexState { } } } - let lookup_batch = btree_lookup_as_batch(&self.page_lookup, &self.data_type)?; - write_ipc_stream(&lookup_batch, writer)?; + write_ipc_stream(&self.lookup_batch, writer)?; Ok(()) } @@ -1239,10 +1467,8 @@ impl CacheCodecImpl for BTreeIndexState { } }; let lookup_batch = read_ipc_stream_single_at(data, &mut offset)?; - let (page_lookup, data_type) = parse_btree_lookup(&lookup_batch)?; Ok(Self { - page_lookup, - data_type, + lookup_batch, batch_size, ranges_to_files, }) @@ -1341,7 +1567,8 @@ pub struct BTreeIndex { impl DeepSizeOf for BTreeIndex { fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { // We don't include the index cache, or anything stored in it. For example: - // sub_index and fri. + // sub_index and fri. `page_lookup` owns the lookup batch (the single source + // of truth), so accounting for it covers the lookup data. self.page_lookup.deep_size_of_children(context) + self.store.deep_size_of_children(context) } } @@ -1437,7 +1664,9 @@ impl BTreeIndex { ranges_to_files: Option>>, frag_reuse_index: Option>, ) -> Result { - let (page_lookup, data_type) = parse_btree_lookup(&data)?; + let data_type = data.column(0).data_type().clone(); + let page_lookup = Arc::new(BTreeLookup::try_new(data)?); + Ok(Self::new( page_lookup, store, @@ -1746,18 +1975,25 @@ impl Index for BTreeIndex { } fn statistics(&self) -> Result { - let min = self - .page_lookup - .tree - .first_key_value() - .map(|(k, _)| k.clone()); - let max = self - .page_lookup - .tree - .last_key_value() - .map(|(k, _)| k.clone()); + let lookup = &self.page_lookup; + let batch = &lookup.batch; + let num_rows = batch.num_rows(); + // The batch is sorted by `min`, so the smallest searchable value is the + // `min` of the first non-all-null page and the largest is the `max` of the + // last page. 
+ let (min, max) = if lookup.search_start >= num_rows { + (None, None) + } else { + let min = OrderableScalarValue(ScalarValue::try_from_array( + batch.column(0), + lookup.search_start, + )?); + let max = + OrderableScalarValue(ScalarValue::try_from_array(batch.column(1), num_rows - 1)?); + (Some(min), Some(max)) + }; serde_json::to_value(&BTreeStatistics { - num_pages: self.page_lookup.tree.len() as u32, + num_pages: num_rows as u32, min, max, }) @@ -1804,7 +2040,7 @@ impl ScalarIndex for BTreeIndex { "full text search is not supported for BTree index, build a inverted index for it", )); } - SargableQuery::IsNull() => self.page_lookup.pages_null(), + SargableQuery::IsNull() => Ok(self.page_lookup.pages_null()), SargableQuery::LikePrefix(prefix) => { // Convert LikePrefix to a range query: [prefix, next_prefix) match prefix { @@ -1838,7 +2074,7 @@ impl ScalarIndex for BTreeIndex { } } } - }; + }?; // For non-IsNull queries, also include null pages so that null row IDs // are tracked in the result. Any comparison with NULL yields NULL, and @@ -1849,13 +2085,18 @@ impl ScalarIndex for BTreeIndex { // We add them as Matches::Some (not Matches::All) so that // FlatIndex::search() evaluates the predicate and correctly marks // the rows as NULL rather than TRUE. + // + // TODO: the lookup batch retains a per-page `null_count`. A fully-covered + // page with zero nulls is a true Matches::All, while one with nulls needs + // Matches::Some only to track the null rows; surfacing `null_count` here + // could refine that classification (see #6802). 
if !matches!(query, SargableQuery::IsNull()) { let existing: HashSet = pages.iter().map(|m| m.page_id()).collect(); for &page_id in self .page_lookup .null_pages - .keys() - .chain(self.page_lookup.all_null_pages.keys()) + .iter() + .chain(self.page_lookup.all_null_pages.iter()) { if !existing.contains(&page_id) { pages.push(Matches::Some(page_id)); @@ -2992,8 +3233,7 @@ impl ScalarIndexPlugin for BTreeIndexPlugin { Error::internal("BTreeIndexPlugin::put_in_cache called with a non-BTree index") })?; let state = BTreeIndexState { - page_lookup: btree.page_lookup.clone(), - data_type: btree.data_type.clone(), + lookup_batch: btree.page_lookup.batch.clone(), batch_size: btree.batch_size, ranges_to_files: btree.ranges_to_files.clone(), }; @@ -3010,7 +3250,7 @@ mod tests { use std::{collections::HashMap, sync::Arc}; use arrow::datatypes::{Float32Type, Float64Type, Int32Type, UInt64Type}; - use arrow_array::{FixedSizeListArray, RecordBatch, record_batch}; + use arrow_array::{FixedSizeListArray, record_batch}; use datafusion::{ execution::{SendableRecordBatchStream, TaskContext}, physical_plan::{ExecutionPlan, sorts::sort::SortExec, stream::RecordBatchStreamAdapter}, @@ -3027,7 +3267,6 @@ mod tests { use lance_io::object_store::ObjectStore; use lance_select::{RowAddrTreeMap, RowSetOps}; use object_store::path::Path; - use rangemap::RangeInclusiveMap; use crate::metrics::LocalMetricsCollector; use crate::progress::{IndexBuildProgress, noop_progress}; @@ -3041,12 +3280,14 @@ mod tests { }; use super::{ - BTreeIndexPlugin, BTreeIndexState, BTreeIndexStateKey, BTreePageKey, - DEFAULT_BTREE_BATCH_SIZE, OrderableScalarValue, btree_lookup_as_batch, parse_btree_lookup, - part_lookup_file_path, part_page_data_file_path, train_btree_index, + BTreeIndexPlugin, BTreeIndexState, BTreeLookup, BTreePageKey, DEFAULT_BTREE_BATCH_SIZE, + Matches, OrderableScalarValue, part_lookup_file_path, part_page_data_file_path, + train_btree_index, }; use crate::scalar::registry::ScalarIndexPlugin; + 
use arrow_array::RecordBatch; use lance_core::cache::{CacheCodecImpl, CacheKey}; + use rangemap::RangeInclusiveMap; lance_testing::define_stage_event_progress!( RecordingProgress, @@ -5023,12 +5264,6 @@ mod tests { } } - #[test] - fn test_btree_page_key_codec() { - // FlatIndex pages can be serialized by a persistent cache backend. - assert!(BTreePageKey::codec().is_some()); - } - fn sample_lookup_batch() -> RecordBatch { record_batch!( ("min", Int32, [Some(0), Some(10), Some(20)]), @@ -5039,62 +5274,642 @@ mod tests { .unwrap() } - fn mixed_null_lookup_batch() -> RecordBatch { - record_batch!( - ("min", Int32, [None, Some(0), Some(10)]), - ("max", Int32, [None, Some(9), Some(19)]), - ("null_count", UInt32, [1, 0, 2]), - ("page_idx", UInt32, [42, 0, 1]) + fn osv(v: i32) -> OrderableScalarValue { + OrderableScalarValue(ScalarValue::Int32(Some(v))) + } + + /// The rewritten [`BTreeLookup`] searches the lookup batch directly, so this + /// exercises the binary-search bounds, duplicate `min` values, a partial-null + /// (null `min`) straddling page, and the `Matches::Some`/`All` classification. + #[test] + fn test_btree_lookup_pages_between() { + // Pages sorted by `min`, NULLs first. Page 0 straddles the NULL/non-NULL + // boundary; pages 2 and 3 share a `min` of 20. 
+ let batch = record_batch!( + ("min", Int32, [None, Some(10), Some(20), Some(20), Some(40)]), + ( + "max", + Int32, + [Some(5), Some(20), Some(20), Some(30), Some(50)] + ), + ("null_count", UInt32, [2, 0, 0, 0, 0]), + ("page_idx", UInt32, [0, 1, 2, 3, 4]) ) - .unwrap() + .unwrap(); + let lookup = BTreeLookup::try_new(batch).unwrap(); + assert_eq!(lookup.null_pages, vec![0]); + assert!(lookup.all_null_pages.is_empty()); + assert_eq!(lookup.search_start, 0); + + let between = |lo: i32, hi: i32| { + let mut m = lookup + .pages_between(( + std::ops::Bound::Included(&osv(lo)), + std::ops::Bound::Included(&osv(hi)), + )) + .unwrap(); + m.sort_by_key(|m| m.page_id()); + m + }; + + // Equality only ever yields partial (Some) matches. + assert_eq!(lookup.pages_eq(&osv(15)).unwrap(), vec![Matches::Some(1)]); + assert_eq!( + lookup.pages_eq(&osv(20)).unwrap(), + vec![Matches::Some(1), Matches::Some(2), Matches::Some(3)] + ); + assert!(lookup.pages_eq(&osv(35)).unwrap().is_empty()); + + // [20, 25]: page 2 ([20, 20]) sits entirely inside -> All; pages 1 and 3 + // only partially overlap -> Some. The null-min page 0 (max 5) is excluded. + assert_eq!( + between(20, 25), + vec![Matches::Some(1), Matches::All(2), Matches::Some(3)] + ); + + // A query below all non-null data still reaches the straddling page 0, + // which is only ever a partial match because its `min` is NULL. + assert_eq!(between(0, 5), vec![Matches::Some(0)]); + + // Unbounded above: page 4 ([40, 50]) is fully covered from 40 onward. + assert_eq!( + lookup + .pages_between(( + std::ops::Bound::Included(&osv(40)), + std::ops::Bound::Unbounded + )) + .unwrap(), + vec![Matches::All(4)] + ); + + // Empty / inverted ranges select nothing. 
+ assert!(between(31, 39).is_empty()); + assert!( + lookup + .pages_between(( + std::ops::Bound::Included(&osv(25)), + std::ops::Bound::Included(&osv(15)) + )) + .unwrap() + .is_empty() + ); } - fn btree_state( - lookup_batch: RecordBatch, - batch_size: u64, - ranges_to_files: Option>>, - ) -> BTreeIndexState { - let (page_lookup, data_type) = parse_btree_lookup(&lookup_batch).unwrap(); - BTreeIndexState { - page_lookup, - data_type, - batch_size, - ranges_to_files, + /// Exercises the native byte comparator path (`accessor_cmp`) for + /// variable-length `Binary` and fixed-width `FixedSizeBinary` (e.g. UUID) + /// columns, including the null-min straddle page and duplicate `min`s. + #[test] + fn test_btree_lookup_pages_eq_bytes() { + use arrow_array::{ + ArrayRef, BinaryArray, FixedSizeBinaryArray, LargeBinaryArray, LargeStringArray, + UInt32Array, + }; + use arrow_schema::{DataType, Field, Schema}; + + // 2-byte big-endian keys, so lexicographic byte order matches numeric + // order. Same layout as the int test: page 0 is a null-min straddle, + // pages 2 and 3 share `min` 20, and 35 falls in a gap. 
+ fn be(v: u16) -> [u8; 2] { + v.to_be_bytes() } + let mins = [None, Some(10u16), Some(20), Some(20), Some(40)]; + let maxs = [Some(5u16), Some(20), Some(20), Some(30), Some(50)]; + let null_count = UInt32Array::from(vec![2u32, 0, 0, 0, 0]); + let page_idx = UInt32Array::from(vec![0u32, 1, 2, 3, 4]); + + let assert_byte_lookup = + |min_arr: ArrayRef, max_arr: ArrayRef, sv: &dyn Fn(u16) -> ScalarValue| { + let batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![ + Field::new("min", min_arr.data_type().clone(), true), + Field::new("max", max_arr.data_type().clone(), true), + Field::new("null_count", DataType::UInt32, false), + Field::new("page_idx", DataType::UInt32, false), + ])), + vec![ + min_arr, + max_arr, + Arc::new(null_count.clone()), + Arc::new(page_idx.clone()), + ], + ) + .unwrap(); + let lookup = BTreeLookup::try_new(batch).unwrap(); + + let eq = |v: u16| { + let mut p: Vec = lookup + .pages_eq(&OrderableScalarValue(sv(v))) + .unwrap() + .into_iter() + .map(|m| m.page_id()) + .collect(); + p.sort_unstable(); + p + }; + assert_eq!(eq(15), vec![1]); // only page 1 ([10, 20]) + assert_eq!(eq(20), vec![1, 2, 3]); // shared min of 2 & 3, max of 1 + assert!(eq(35).is_empty()); // gap between pages 3 and 4 + assert_eq!(eq(5), vec![0]); // reaches the null-min straddle via its max + + // IN merges and dedups across values. 
+ let mut in_pages: Vec = lookup + .pages_in([5u16, 15].into_iter().map(|v| OrderableScalarValue(sv(v)))) + .unwrap() + .into_iter() + .map(|m| m.page_id()) + .collect(); + in_pages.sort_unstable(); + assert_eq!(in_pages, vec![0, 1]); + }; + + let fsb = |arr: &[Option]| -> ArrayRef { + Arc::new( + FixedSizeBinaryArray::try_from_sparse_iter_with_size( + arr.iter().copied().map(|o| o.map(be)), + 2, + ) + .unwrap(), + ) + }; + assert_byte_lookup(fsb(&mins), fsb(&maxs), &|v| { + ScalarValue::FixedSizeBinary(2, Some(be(v).to_vec())) + }); + + let bin = |arr: &[Option]| -> ArrayRef { + Arc::new(BinaryArray::from_iter( + arr.iter().copied().map(|o| o.map(|v| be(v).to_vec())), + )) + }; + assert_byte_lookup(bin(&mins), bin(&maxs), &|v| { + ScalarValue::Binary(Some(be(v).to_vec())) + }); + + let lbin = |arr: &[Option]| -> ArrayRef { + Arc::new(LargeBinaryArray::from_iter( + arr.iter().copied().map(|o| o.map(|v| be(v).to_vec())), + )) + }; + assert_byte_lookup(lbin(&mins), lbin(&maxs), &|v| { + ScalarValue::LargeBinary(Some(be(v).to_vec())) + }); + + // `LargeUtf8` over zero-padded decimal strings, whose lexicographic order + // matches the numeric order of the keys. + let lstr = |arr: &[Option]| -> ArrayRef { + Arc::new(LargeStringArray::from_iter( + arr.iter().copied().map(|o| o.map(|v| format!("{v:02}"))), + )) + }; + assert_byte_lookup(lstr(&mins), lstr(&maxs), &|v| { + ScalarValue::LargeUtf8(Some(format!("{v:02}"))) + }); } - fn btree_state_from_index(index: &BTreeIndex) -> BTreeIndexState { - BTreeIndexState { - page_lookup: index.page_lookup.clone(), - data_type: index.data_type.clone(), - batch_size: index.batch_size, - ranges_to_files: index.ranges_to_files.clone(), - } + /// Exercises the physical-type reinterpret path: temporal columns (`Date32` + /// over `i32`, `Timestamp` over `i64`) are compared through the integer native + /// path without a dedicated per-type branch. 
+ #[test] + fn test_btree_lookup_pages_eq_temporal() { + use arrow_array::{ArrayRef, Date32Array, TimestampMicrosecondArray, UInt32Array}; + use arrow_schema::{DataType, Field, Schema}; + + let null_count = UInt32Array::from(vec![2u32, 0, 0, 0, 0]); + let page_idx = UInt32Array::from(vec![0u32, 1, 2, 3, 4]); + + let assert_lookup = + |min_arr: ArrayRef, max_arr: ArrayRef, sv: &dyn Fn(i64) -> ScalarValue| { + let batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![ + Field::new("min", min_arr.data_type().clone(), true), + Field::new("max", max_arr.data_type().clone(), true), + Field::new("null_count", DataType::UInt32, false), + Field::new("page_idx", DataType::UInt32, false), + ])), + vec![ + min_arr, + max_arr, + Arc::new(null_count.clone()), + Arc::new(page_idx.clone()), + ], + ) + .unwrap(); + let lookup = BTreeLookup::try_new(batch).unwrap(); + let eq = |v: i64| { + let mut p: Vec = lookup + .pages_eq(&OrderableScalarValue(sv(v))) + .unwrap() + .into_iter() + .map(|m| m.page_id()) + .collect(); + p.sort_unstable(); + p + }; + assert_eq!(eq(15), vec![1]); // only page 1 ([10, 20]) + assert_eq!(eq(20), vec![1, 2, 3]); // shared min of 2 & 3, max of 1 + assert!(eq(35).is_empty()); // gap between pages 3 and 4 + assert_eq!(eq(5), vec![0]); // reaches the null-min straddle via its max + }; + + // Timestamp (i64-backed) → Int64 native path. + assert_lookup( + Arc::new(TimestampMicrosecondArray::from(vec![ + None, + Some(10), + Some(20), + Some(20), + Some(40), + ])), + Arc::new(TimestampMicrosecondArray::from(vec![ + Some(5), + Some(20), + Some(20), + Some(30), + Some(50), + ])), + &|v| ScalarValue::TimestampMicrosecond(Some(v), None), + ); + + // Date32 (i32-backed) → Int32 native path. 
+ assert_lookup( + Arc::new(Date32Array::from(vec![ + None, + Some(10), + Some(20), + Some(20), + Some(40), + ])), + Arc::new(Date32Array::from(vec![ + Some(5), + Some(20), + Some(20), + Some(30), + Some(50), + ])), + &|v| ScalarValue::Date32(Some(v as i32)), + ); + } + + /// Exercises the remaining physical-type dispatch arms that the temporal and + /// byte tests don't reach: every integer width and signedness, `Float16`, and + /// the 128-/256-bit decimal paths. All share the temporal test's numeric layout + /// (mins `[_, 10, 20, 20, 40]`, maxs `[5, 20, 20, 30, 50]`) so the assertions are + /// identical; only the array/scalar type varies. + #[test] + fn test_btree_lookup_pages_eq_numeric_widths() { + use arrow::datatypes::i256; + use arrow_array::{ + ArrayRef, Decimal128Array, Decimal256Array, Float16Array, Int8Array, Int16Array, + UInt8Array, UInt16Array, UInt32Array, UInt64Array, + }; + use arrow_schema::{DataType, Field, Schema}; + use half::f16; + + let null_count = UInt32Array::from(vec![2u32, 0, 0, 0, 0]); + let page_idx = UInt32Array::from(vec![0u32, 1, 2, 3, 4]); + let assert_lookup = + |min_arr: ArrayRef, max_arr: ArrayRef, sv: &dyn Fn(i64) -> ScalarValue| { + let batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![ + Field::new("min", min_arr.data_type().clone(), true), + Field::new("max", max_arr.data_type().clone(), true), + Field::new("null_count", DataType::UInt32, false), + Field::new("page_idx", DataType::UInt32, false), + ])), + vec![ + min_arr, + max_arr, + Arc::new(null_count.clone()), + Arc::new(page_idx.clone()), + ], + ) + .unwrap(); + let lookup = BTreeLookup::try_new(batch).unwrap(); + let eq = |v: i64| { + let mut p: Vec = lookup + .pages_eq(&OrderableScalarValue(sv(v))) + .unwrap() + .into_iter() + .map(|m| m.page_id()) + .collect(); + p.sort_unstable(); + p + }; + assert_eq!(eq(15), vec![1]); // only page 1 ([10, 20]) + assert_eq!(eq(20), vec![1, 2, 3]); // shared min of 2 & 3, max of 1 + assert!(eq(35).is_empty()); // gap 
between pages 3 and 4 + assert_eq!(eq(5), vec![0]); // reaches the null-min straddle via its max + }; + + assert_lookup( + Arc::new(Int8Array::from(vec![ + None, + Some(10), + Some(20), + Some(20), + Some(40), + ])), + Arc::new(Int8Array::from(vec![ + Some(5), + Some(20), + Some(20), + Some(30), + Some(50), + ])), + &|v| ScalarValue::Int8(Some(v as i8)), + ); + assert_lookup( + Arc::new(Int16Array::from(vec![ + None, + Some(10), + Some(20), + Some(20), + Some(40), + ])), + Arc::new(Int16Array::from(vec![ + Some(5), + Some(20), + Some(20), + Some(30), + Some(50), + ])), + &|v| ScalarValue::Int16(Some(v as i16)), + ); + assert_lookup( + Arc::new(UInt8Array::from(vec![ + None, + Some(10), + Some(20), + Some(20), + Some(40), + ])), + Arc::new(UInt8Array::from(vec![ + Some(5), + Some(20), + Some(20), + Some(30), + Some(50), + ])), + &|v| ScalarValue::UInt8(Some(v as u8)), + ); + assert_lookup( + Arc::new(UInt16Array::from(vec![ + None, + Some(10), + Some(20), + Some(20), + Some(40), + ])), + Arc::new(UInt16Array::from(vec![ + Some(5), + Some(20), + Some(20), + Some(30), + Some(50), + ])), + &|v| ScalarValue::UInt16(Some(v as u16)), + ); + assert_lookup( + Arc::new(UInt32Array::from(vec![ + None, + Some(10), + Some(20), + Some(20), + Some(40), + ])), + Arc::new(UInt32Array::from(vec![ + Some(5), + Some(20), + Some(20), + Some(30), + Some(50), + ])), + &|v| ScalarValue::UInt32(Some(v as u32)), + ); + assert_lookup( + Arc::new(UInt64Array::from(vec![ + None, + Some(10), + Some(20), + Some(20), + Some(40), + ])), + Arc::new(UInt64Array::from(vec![ + Some(5), + Some(20), + Some(20), + Some(30), + Some(50), + ])), + &|v| ScalarValue::UInt64(Some(v as u64)), + ); + + let f = |v: f64| f16::from_f64(v); + assert_lookup( + Arc::new(Float16Array::from(vec![ + None, + Some(f(10.0)), + Some(f(20.0)), + Some(f(20.0)), + Some(f(40.0)), + ])), + Arc::new(Float16Array::from(vec![ + Some(f(5.0)), + Some(f(20.0)), + Some(f(20.0)), + Some(f(30.0)), + Some(f(50.0)), + ])), + &|v| 
ScalarValue::Float16(Some(f(v as f64))), + ); + + // Decimal128 (i128 native path). Comparison is on the raw integer, so a + // scale of 0 lets the values double as plain integers. + let dec128 = |vals: Vec>| -> ArrayRef { + Arc::new( + Decimal128Array::from(vals) + .with_precision_and_scale(18, 0) + .unwrap(), + ) + }; + assert_lookup( + dec128(vec![None, Some(10), Some(20), Some(20), Some(40)]), + dec128(vec![Some(5), Some(20), Some(20), Some(30), Some(50)]), + &|v| ScalarValue::Decimal128(Some(v as i128), 18, 0), + ); + + // Decimal256 (i256 native path). + let dec256 = |vals: Vec>| -> ArrayRef { + Arc::new( + Decimal256Array::from( + vals.into_iter() + .map(|o| o.map(i256::from_i128)) + .collect::>(), + ) + .with_precision_and_scale(40, 0) + .unwrap(), + ) + }; + assert_lookup( + dec256(vec![None, Some(10), Some(20), Some(20), Some(40)]), + dec256(vec![Some(5), Some(20), Some(20), Some(30), Some(50)]), + &|v| ScalarValue::Decimal256(Some(i256::from_i128(v as i128)), 40, 0), + ); + } + + /// Exercises the NULL paths of the lookup directly: `pages_eq(NULL)` and + /// `pages_in` with a NULL in the value list (and a NULL-only list), including + /// the partial-null (`Some`) vs entirely-null (`All`) page classification. + #[test] + fn test_btree_lookup_pages_null() { + // Page 0 is entirely null (null max -> All); page 1 is a partial-null + // straddle (max 5, null_count > 0 -> Some); page 2 also carries a null. + let batch = record_batch!( + ("min", Int32, [None, None, Some(10), Some(20), Some(40)]), + ("max", Int32, [None, Some(5), Some(20), Some(30), Some(50)]), + ("null_count", UInt32, [3, 2, 1, 0, 0]), + ("page_idx", UInt32, [0, 1, 2, 3, 4]) + ) + .unwrap(); + let lookup = BTreeLookup::try_new(batch).unwrap(); + assert_eq!(lookup.all_null_pages, vec![0]); + assert_eq!(lookup.null_pages, vec![1, 2]); + + // pages_eq(NULL) short-circuits to the null pages: partial-null pages are + // `Some`, the entirely-null page is `All`. 
+ assert_eq!( + lookup + .pages_eq(&OrderableScalarValue(ScalarValue::Int32(None))) + .unwrap(), + vec![Matches::Some(1), Matches::Some(2), Matches::All(0)] + ); + + let in_ids = |vals: Vec>| { + let mut p: Vec = lookup + .pages_in( + vals.into_iter() + .map(|v| OrderableScalarValue(ScalarValue::Int32(v))), + ) + .unwrap() + .into_iter() + .map(|m| m.page_id()) + .collect(); + p.sort_unstable(); + p + }; + // Baseline: a non-null value only -> just its value page. + assert_eq!(in_ids(vec![Some(45)]), vec![4]); + // A NULL in the list unions in every null page (0, 1, 2). + assert_eq!(in_ids(vec![Some(45), None]), vec![0, 1, 2, 4]); + // A NULL-only list (empty non-null set) returns exactly the null pages. + assert_eq!(in_ids(vec![None]), vec![0, 1, 2]); + } + + /// A 0-row page_lookup batch (an index over an empty dataset) must yield no + /// candidates for any query rather than panicking on the binary-search bounds. + #[test] + fn test_btree_lookup_empty_batch() { + use arrow_schema::{DataType, Field, Schema}; + + let schema = Arc::new(Schema::new(vec![ + Field::new("min", DataType::Int32, true), + Field::new("max", DataType::Int32, true), + Field::new("null_count", DataType::UInt32, false), + Field::new("page_idx", DataType::UInt32, false), + ])); + let lookup = BTreeLookup::try_new(RecordBatch::new_empty(schema)).unwrap(); + assert_eq!(lookup.search_start, 0); + assert!(lookup.null_pages.is_empty()); + assert!(lookup.all_null_pages.is_empty()); + + assert!(lookup.pages_eq(&osv(5)).unwrap().is_empty()); + assert!(lookup.pages_in([osv(5)]).unwrap().is_empty()); + assert!( + lookup + .pages_between(( + std::ops::Bound::Included(&osv(0)), + std::ops::Bound::Included(&osv(100)), + )) + .unwrap() + .is_empty() + ); + assert!(lookup.pages_null().is_empty()); + } + + /// A straddle page (null `min`, non-null `max`) can sort ahead of an entirely- + /// null page within the leading NULL-`min` group. 
When it does, `search_start` + /// points at the straddle and the all-null page falls inside the forward-scan + /// window, so both the equality and range scans must skip it (it matches only + /// IS NULL). + #[test] + fn test_btree_lookup_skips_all_null_page_in_scan_window() { + // Page 0: straddle (null min, max 5). Page 1: entirely null (null min/max). + let batch = record_batch!( + ("min", Int32, [None, None, Some(10), Some(20), Some(40)]), + ("max", Int32, [Some(5), None, Some(20), Some(30), Some(50)]), + ("null_count", UInt32, [2, 3, 0, 0, 0]), + ("page_idx", UInt32, [0, 1, 2, 3, 4]) + ) + .unwrap(); + let lookup = BTreeLookup::try_new(batch).unwrap(); + assert_eq!(lookup.search_start, 0); // straddle page 0 has a non-null max + assert_eq!(lookup.all_null_pages, vec![1]); + assert_eq!(lookup.null_pages, vec![0]); + + // Equality for 5 peeks left across the all-null page 1 (index 1, inside the + // scan window) and must skip it, reaching only the straddle page 0. + assert_eq!( + lookup + .pages_eq(&osv(5)) + .unwrap() + .into_iter() + .map(|m| m.page_id()) + .collect::>(), + vec![0] + ); + + // The same all-null page sits inside the range scan window and is skipped: + // page 0 (straddle) is a partial match, pages 2-4 are fully covered. 
+ let mut between = lookup + .pages_between(( + std::ops::Bound::Included(&osv(0)), + std::ops::Bound::Included(&osv(100)), + )) + .unwrap(); + between.sort_by_key(|m| m.page_id()); + assert_eq!( + between, + vec![ + Matches::Some(0), + Matches::All(2), + Matches::All(3), + Matches::All(4), + ] + ); } fn assert_state_roundtrips(state: &BTreeIndexState) { let mut buf = Vec::new(); state.serialize(&mut buf).unwrap(); let restored = BTreeIndexState::deserialize(&bytes::Bytes::from(buf)).unwrap(); - assert_eq!(restored.page_lookup, state.page_lookup); - assert_eq!(restored.data_type, state.data_type); + assert_eq!(restored.lookup_batch, state.lookup_batch); assert_eq!(restored.batch_size, state.batch_size); assert_eq!(restored.ranges_to_files, state.ranges_to_files); + } - let restored_batch = - btree_lookup_as_batch(&restored.page_lookup, &restored.data_type).unwrap(); - let (parsed_again, _) = parse_btree_lookup(&restored_batch).unwrap(); - assert_eq!(parsed_again, restored.page_lookup); + #[test] + fn test_btree_page_key_codec() { + // FlatIndex pages can be serialized by a persistent cache backend. + assert!(BTreePageKey::codec().is_some()); } #[test] fn test_btree_index_state_roundtrip() { // Not range-partitioned. - assert_state_roundtrips(&btree_state( - sample_lookup_batch(), - DEFAULT_BTREE_BATCH_SIZE, - None, - )); + assert_state_roundtrips(&BTreeIndexState { + lookup_batch: sample_lookup_batch(), + batch_size: DEFAULT_BTREE_BATCH_SIZE, + ranges_to_files: None, + }); // Range-partitioned across multiple files. let ranges: RangeInclusiveMap = [ @@ -5103,30 +5918,22 @@ mod tests { ] .into_iter() .collect(); - assert_state_roundtrips(&btree_state( - sample_lookup_batch(), - 8192, - Some(Arc::new(ranges)), - )); - - // Empty index keeps its data type even though it has no lookup rows. 
- assert_state_roundtrips(&btree_state( - RecordBatch::new_empty(sample_lookup_batch().schema()), - DEFAULT_BTREE_BATCH_SIZE, - None, - )); - - // Mixed all-null and non-null pages must round-trip without creating a - // NULL sentinel in a non-empty lookup tree. - assert_state_roundtrips(&btree_state( - mixed_null_lookup_batch(), - DEFAULT_BTREE_BATCH_SIZE, - None, - )); + assert_state_roundtrips(&BTreeIndexState { + lookup_batch: sample_lookup_batch(), + batch_size: 8192, + ranges_to_files: Some(Arc::new(ranges)), + }); + + // Empty index. + assert_state_roundtrips(&BTreeIndexState { + lookup_batch: RecordBatch::new_empty(sample_lookup_batch().schema()), + batch_size: DEFAULT_BTREE_BATCH_SIZE, + ranges_to_files: None, + }); } #[tokio::test] - async fn test_btree_plugin_cache_returns_deserialized_index() { + async fn test_btree_index_state_reconstruct_and_plugin_cache() { let tmpdir = TempObjDir::default(); let test_store = Arc::new(LanceIndexStore::new( Arc::new(ObjectStore::local()), @@ -5145,23 +5952,37 @@ mod tests { let index = BTreeIndex::load(test_store.clone(), None, &LanceCache::no_cache()) .await .unwrap(); - let index_dyn: Arc = index.clone(); + + // Round-trip the state through the codec and reconstruct an index from it. + let state = BTreeIndexState { + lookup_batch: index.page_lookup.batch.clone(), + batch_size: index.batch_size, + ranges_to_files: index.ranges_to_files.clone(), + }; + let mut buf = Vec::new(); + state.serialize(&mut buf).unwrap(); + let restored = BTreeIndexState::deserialize(&bytes::Bytes::from(buf)).unwrap(); + let reconstructed = restored + .reconstruct(test_store.clone(), &LanceCache::no_cache(), None) + .unwrap(); + assert_eq!( + reconstructed + .as_any() + .downcast_ref::() + .unwrap() + .page_lookup, + index.page_lookup + ); + + // The plugin's put/get hooks round-trip through a real cache + the codec. 
let cache = LanceCache::with_capacity(64 * 1024 * 1024); let plugin = BTreeIndexPlugin; - plugin - .put_in_cache(&cache, index_dyn.clone()) - .await - .unwrap(); + plugin.put_in_cache(&cache, index.clone()).await.unwrap(); let from_cache = plugin .get_from_cache(test_store.clone(), None, &cache) .await .unwrap() .expect("index should be served from the cache"); - let cached_btree = from_cache.as_any().downcast_ref::().unwrap(); - assert!( - Arc::ptr_eq(&cached_btree.page_lookup, &index.page_lookup), - "BTree cache should reuse the parsed lookup tree" - ); // Searches against the cached index match the original. let query = SargableQuery::Range( @@ -5176,51 +5997,9 @@ mod tests { assert_eq!(expected, actual); } - #[tokio::test] - async fn test_btree_index_state_cache_size_includes_parsed_lookup() { - let tmpdir = TempObjDir::default(); - let test_store = Arc::new(LanceIndexStore::new( - Arc::new(ObjectStore::local()), - tmpdir.clone(), - Arc::new(LanceCache::no_cache()), - )); - - let stream = gen_batch() - .col("value", array::step::()) - .col("_rowid", array::step::()) - .into_df_stream(RowCount::from(1000), BatchCount::from(5)); - train_btree_index(stream, test_store.as_ref(), 1000, None, None) - .await - .unwrap(); - - let index = BTreeIndex::load(test_store.clone(), None, &LanceCache::no_cache()) - .await - .unwrap(); - let cache = LanceCache::with_capacity(64 * 1024 * 1024); - let plugin = BTreeIndexPlugin; - plugin.put_in_cache(&cache, index.clone()).await.unwrap(); - - let cached_state = cache - .get_with_key(&BTreeIndexStateKey) - .await - .expect("state should be cached"); - assert!( - Arc::ptr_eq(&cached_state.page_lookup, &index.page_lookup), - "cached state should retain the parsed lookup tree" - ); - - let arc_overhead = std::mem::size_of::() * 2; - let expected_size = cached_state.deep_size_of() + arc_overhead; - let charged_size = cache.size_bytes().await; - let size_diff = charged_size.abs_diff(expected_size); - assert!( - size_diff <= 
std::mem::size_of::() * 2, - "cache charged {charged_size} bytes, expected about {expected_size} bytes" - ); - } - #[test] fn test_btree_index_state_rejects_invalid_has_ranges_tag() { + // u64 batch_size (any) then a bad has_ranges tag. let mut buf = Vec::new(); buf.extend_from_slice(&1000u64.to_le_bytes()); buf.push(7u8); @@ -5232,67 +6011,6 @@ mod tests { ); } - #[tokio::test] - async fn test_btree_load_applies_frag_reuse_index() { - use crate::frag_reuse::{FragReuseIndex, FragReuseIndexDetails}; - use std::collections::HashMap; - use uuid::Uuid; - - let tmpdir = TempObjDir::default(); - let test_store = Arc::new(LanceIndexStore::new( - Arc::new(ObjectStore::local()), - tmpdir.clone(), - Arc::new(LanceCache::no_cache()), - )); - - // value == _rowid for all rows in [0, 1000). - let stream = gen_batch() - .col("value", array::step::()) - .col("_rowid", array::step::()) - .into_df_stream(RowCount::from(1000), BatchCount::from(1)); - train_btree_index(stream, test_store.as_ref(), 1000, None, None) - .await - .unwrap(); - - // Remap row 0 -> row 5000 (outside the original [0, 1000) range so no collision). - // Querying for value == 0 should now return row 5000, confirming the - // BTree load path threads the FragReuseIndex into the index. 
- let frag_reuse_index = Arc::new(FragReuseIndex::new( - Uuid::new_v4(), - vec![HashMap::from([(0u64, Some(5000u64))])], - FragReuseIndexDetails { versions: vec![] }, - )); - let index = BTreeIndex::load( - test_store.clone(), - Some(frag_reuse_index), - &LanceCache::no_cache(), - ) - .await - .unwrap(); - - let result = index - .search( - &SargableQuery::Equals(ScalarValue::Int32(Some(0))), - &NoOpMetricsCollector, - ) - .await - .unwrap(); - let row_ids: Vec = match &result { - SearchResult::Exact(set) => set - .true_rows() - .row_addrs() - .unwrap() - .map(u64::from) - .collect(), - other => panic!("expected Exact, got {other:?}"), - }; - assert_eq!( - row_ids, - vec![5000], - "frag_reuse_index remap was not applied" - ); - } - #[tokio::test] async fn test_btree_index_state_reconstruct_applies_frag_reuse_index() { use crate::frag_reuse::{FragReuseIndex, FragReuseIndexDetails}; @@ -5318,8 +6036,15 @@ mod tests { let index = BTreeIndex::load(test_store.clone(), None, &LanceCache::no_cache()) .await .unwrap(); - let state = btree_state_from_index(&index); + let state = BTreeIndexState { + lookup_batch: index.page_lookup.batch.clone(), + batch_size: index.batch_size, + ranges_to_files: index.ranges_to_files.clone(), + }; + // Remap row 0 -> row 5000 (outside the original [0, 1000) range so no collision). + // Querying for value == 0 should now return row 5000, confirming reconstruct threaded + // the FragReuseIndex through to the rebuilt BTreeIndex. let frag_reuse_index = Arc::new(FragReuseIndex::new( Uuid::new_v4(), vec![HashMap::from([(0u64, Some(5000u64))])], @@ -5357,10 +6082,10 @@ mod tests { } #[tokio::test] - async fn test_btree_range_partitioned_plugin_cache_roundtrip() { + async fn test_btree_index_state_range_partitioned_plugin_cache_roundtrip() { // Build a range-partitioned BTree (two range partitions merged into one index) and // round-trip it through the plugin's cache hooks. This exercises the - // `ranges_to_files = Some` path end-to-end. 
+ // `ranges_to_files = Some` path end-to-end through serialize/deserialize/reconstruct. let tmpdir = TempObjDir::default(); let store = Arc::new(LanceIndexStore::new( Arc::new(ObjectStore::local()), From e8748a405821deeddcc543e351c1ffd7ff41cb13 Mon Sep 17 00:00:00 2001 From: everySympathy Date: Thu, 11 Jun 2026 09:52:38 +0800 Subject: [PATCH 083/177] feat(python): expose zonemap segment builds (#7177) --- python/python/lance/dataset.py | 49 +++++++++++++--- python/python/tests/test_scalar_index.py | 73 ++++++++++++++++++++++++ 2 files changed, 113 insertions(+), 9 deletions(-) diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 831e194f9f3..e96d9305ce5 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -3074,6 +3074,38 @@ def _prepare_scalar_index_request( else: raise Exception("index_type must be str or IndexConfig") + @staticmethod + def _normalized_index_type( + index_type: Union[str, IndexConfig], + ) -> str: + if isinstance(index_type, IndexConfig): + index_type = index_type.index_type + return index_type.upper() + + @classmethod + def _is_segment_native_scalar_index_type( + cls, + index_type: Union[str, IndexConfig], + ) -> bool: + return cls._normalized_index_type(index_type) in { + "BTREE", + "BITMAP", + "INVERTED", + "FTS", + "ZONEMAP", + } + + @classmethod + def _requires_uncommitted_scalar_index( + cls, + index_type: Union[str, IndexConfig], + ) -> bool: + return cls._normalized_index_type(index_type) in { + "BTREE", + "BITMAP", + "ZONEMAP", + } + def create_scalar_index( self, column: str, @@ -3291,7 +3323,9 @@ def create_scalar_index( column, index_type, kwargs ) - if fragment_ids is not None and logical_index_type in {"BTREE", "BITMAP"}: + if fragment_ids is not None and self._requires_uncommitted_scalar_index( + logical_index_type + ): raise ValueError( f"{logical_index_type} distributed indexing uses " "create_index_uncommitted(..., " @@ -4004,7 +4038,8 @@ def create_index_uncommitted( 
Create one segment without publishing it and return its metadata. This is the public distributed-build API for vector, BTREE scalar, - canonical bitmap scalar, and INVERTED scalar index construction. Unlike + canonical bitmap scalar, INVERTED scalar, and ZONEMAP scalar index + construction. Unlike :meth:`create_index`, this method does not publish the index into the dataset manifest. Instead, it writes one segment under ``_indices//`` and returns the resulting @@ -4020,7 +4055,7 @@ def create_index_uncommitted( 4. commit the final segment list with :meth:`commit_existing_index_segments` - BTREE, BITMAP and INVERTED segments may + BTREE, BITMAP, INVERTED, and ZONEMAP segments may be merged with :meth:`merge_existing_index_segments` before commit. Parameters are the same as :meth:`create_index`, with one additional requirement: @@ -4047,12 +4082,8 @@ def create_index_uncommitted( Index Metadata for the segment that was written by this call. """ - is_scalar_segment_request = ( - isinstance(index_type, str) - and index_type.upper() in {"BTREE", "BITMAP", "INVERTED", "FTS"} - ) or ( - isinstance(index_type, IndexConfig) - and index_type.index_type.upper() in {"BTREE", "BITMAP", "INVERTED", "FTS"} + is_scalar_segment_request = self._is_segment_native_scalar_index_type( + index_type ) if is_scalar_segment_request: if fragment_ids is None: diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index 191abac2b18..7ddfbbc0dc8 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -4054,6 +4054,79 @@ def test_bitmap_uncommitted_segments_can_be_committed_from_python(tmp_path): ) +def test_zonemap_fragment_ids_parameter_validation(tmp_path): + ds = generate_multi_fragment_dataset( + tmp_path, num_fragments=2, rows_per_fragment=100 + ) + + fragment_ids = [fragment.fragment_id for fragment in ds.get_fragments()] + with pytest.raises(ValueError, match="create_index_uncommitted"): + 
ds.create_scalar_index( + column="id", + index_type="ZONEMAP", + fragment_ids=[fragment_ids[0]], + ) + + +def test_zonemap_segment_merge_and_commit_from_python(tmp_path): + rows_per_fragment = 20_000 + ds = generate_multi_fragment_dataset( + tmp_path, num_fragments=4, rows_per_fragment=rows_per_fragment + ) + + index_name = "id_zonemap_segments" + fragment_ids = [fragment.fragment_id for fragment in ds.get_fragments()] + staged_segments = [ + ds.create_index_uncommitted( + column="id", + index_type="ZONEMAP", + name=index_name, + fragment_ids=[fragment_id], + ) + for fragment_id in fragment_ids + ] + + assert len({segment.uuid for segment in staged_segments}) == len(staged_segments) + for segment, fragment_id in zip(staged_segments, fragment_ids): + files = segment.files + assert files is not None + assert segment.fragment_ids == {fragment_id} + assert any(file.path == "zonemap.lance" for file in files) + assert all(not file.path.startswith("part_") for file in files) + + merged_segment = ds.merge_existing_index_segments(staged_segments) + merged_files = merged_segment.files + assert merged_files is not None + assert merged_segment.uuid not in {segment.uuid for segment in staged_segments} + assert merged_segment.fragment_ids == set(fragment_ids) + assert any(file.path == "zonemap.lance" for file in merged_files) + assert all(not file.path.startswith("part_") for file in merged_files) + + ds = ds.commit_existing_index_segments(index_name, "id", [merged_segment]) + descriptions = {index.name: index for index in ds.describe_indices()} + assert descriptions[index_name].index_type == "ZoneMap" + assert len(descriptions[index_name].segments) == 1 + + filter_expr = "id >= 8200 AND id < 8300" + without_index = ds.scanner( + filter=filter_expr, + columns=["id", "text"], + use_scalar_index=False, + ).to_table() + with_index = ds.scanner( + filter=filter_expr, + columns=["id", "text"], + use_scalar_index=True, + ).to_table() + + assert with_index.num_rows == 
without_index.num_rows + assert with_index["id"].to_pylist() == without_index["id"].to_pylist() + assert ( + "ScalarIndexQuery" + in ds.scanner(filter=filter_expr, use_scalar_index=True).explain_plan() + ) + + def test_merge_index_metadata_btree_soft_break(tmp_path): ds = generate_multi_fragment_dataset( tmp_path, num_fragments=2, rows_per_fragment=100 From c25632605e2698f63aa01ffb5985141b87bc45a6 Mon Sep 17 00:00:00 2001 From: Vova Kolmakov Date: Thu, 11 Jun 2026 12:55:33 +0700 Subject: [PATCH 084/177] feat(index): expose per-query I/O metrics on ANN operators (#7204) ## Summary Closes #7201. The vector-search operators `ANNSubIndex` and `ANNIvfPartition` only reported index metrics and registered no `IoMetrics`, so index-file I/O was invisible in `EXPLAIN ANALYZE`. This makes per-query `bytes_read` / `iops` / `requests` observable on both operators. ## Design The ANN operators delegate reads to a cached, shared `IVFIndex` whose `ScanScheduler` is reused across queries, so the established "own a fresh scheduler and read its cumulative stats" pattern does not apply. A lightweight per-query sink (`IoStats`, backed by a new `IoStatsRecorder` trait in `lance-core`) rides on the already per-query `MetricsCollector` and is attached to the index / quantization-storage readers only on an actual cache-miss load, tapping the authoritative `FileScheduler` accounting point. Numbers are exact, correctly attributed, and ~0 on a warm cache. The one-time index-open I/O (file footers, IVF centroids, quantization metadata) is captured from the fresh `IVFIndex::try_new` scheduler and attributed to `ANNIvfPartition`. Always on, consistent with the data-path operators; surfaces through the existing scan-statistics / EXPLAIN ANALYZE path with no DataFusion-layer change. ## Observability (EXPLAIN ANALYZE) Cold query (empty index cache): - `ANNSubIndex: ... parts_loaded=10, bytes_read=57.02 K, iops=40, requests=40` - `ANNIvfPartition: ... 
bytes_read=8.11 K, iops=4, requests=4` (index-open I/O) Warm query (partitions cached): both operators report `bytes_read=0, iops=0, requests=0`. ## Overhead Adding metrics is observability, not a speedup. Measured via a same-binary A/B (feature on vs off, which isolates runtime cost from code-layout noise): `Ivf_PQ_NoCache` change `[-0.57%, -0.08%, +0.43%]`, p=0.76, "No change in performance detected". ## Tests - `lance-io`: the per-query sink records identically to the scheduler and stays isolated from sibling file handles. - `lance`: cold-vs-warm aggregate counts (`ExecutionSummaryCounts`); and an end-to-end `EXPLAIN ANALYZE` assertion that the metrics are non-zero (cold) and zero (warm) on both ANN operators. --------- Co-authored-by: Vova Kolmakov Co-authored-by: Claude Opus 4.8 (1M context) --- rust/lance-core/src/utils.rs | 1 + rust/lance-core/src/utils/io_stats.rs | 30 +++++ rust/lance-encoding/src/lib.rs | 16 +++ rust/lance-file/src/io.rs | 10 ++ rust/lance-file/src/reader.rs | 17 +++ rust/lance-index/src/metrics.rs | 13 +++ rust/lance-index/src/vector.rs | 8 ++ rust/lance-index/src/vector/storage.rs | 22 +++- rust/lance-io/src/scheduler.rs | 153 ++++++++++++++++++++++++- rust/lance/src/index.rs | 9 ++ rust/lance/src/index/vector/builder.rs | 2 +- rust/lance/src/index/vector/ivf.rs | 2 +- rust/lance/src/index/vector/ivf/v2.rs | 62 ++++++++-- rust/lance/src/io/exec/knn.rs | 128 +++++++++++++++++++++ rust/lance/src/io/exec/utils.rs | 36 +++++- 15 files changed, 485 insertions(+), 24 deletions(-) create mode 100644 rust/lance-core/src/utils/io_stats.rs diff --git a/rust/lance-core/src/utils.rs b/rust/lance-core/src/utils.rs index 8f16744b158..c202329838c 100644 --- a/rust/lance-core/src/utils.rs +++ b/rust/lance-core/src/utils.rs @@ -12,6 +12,7 @@ pub mod cpu; pub mod deletion; pub mod futures; pub mod hash; +pub mod io_stats; pub mod parse; pub mod path; pub mod tempfile; diff --git a/rust/lance-core/src/utils/io_stats.rs 
b/rust/lance-core/src/utils/io_stats.rs new file mode 100644 index 00000000000..e2169d71ae3 --- /dev/null +++ b/rust/lance-core/src/utils/io_stats.rs @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::ops::Range; + +/// A sink that records I/O requests as they are submitted to storage. +/// +/// This lives in `lance-core` so that the encoding layer (`lance-encoding`) and +/// the I/O layer (`lance-io`) can both refer to it without depending on one +/// another. It lets a caller attach a lightweight counter to a file reader and +/// measure the exact bytes/IOPS performed for a bounded scope (e.g. a single +/// query); see `lance_io::scheduler::IoStats` for the concrete implementation. +/// +/// # When to use this +/// +/// Lance also exposes two *process-wide, cumulative* I/O accounting facilities: +/// the global scheduler counters (`lance_io::scheduler::iops_counter` / +/// `bytes_read_counter`) and the object-store `IOTracker` wrapper used in tests. +/// Both aggregate every read in the process and cannot attribute I/O to a single +/// bounded scope. Prefer an `IoStatsRecorder` when you need the *exact* I/O of +/// one operation (e.g. a single query): attach it to a reader with +/// `with_io_stats`, then read the snapshot when the scope ends. It re-uses the +/// reader's cached metadata, so measuring costs no extra file opens and does not +/// disturb the global counters. +pub trait IoStatsRecorder: std::fmt::Debug + Send + Sync { + /// Record one completed request, given the byte ranges as actually + /// submitted to storage (i.e. after any coalescing/splitting), so the + /// counts reflect physical I/O. 
+ fn record_request(&self, ranges: &[Range]); +} diff --git a/rust/lance-encoding/src/lib.rs b/rust/lance-encoding/src/lib.rs index cb4062d3220..a58e0a14c59 100644 --- a/rust/lance-encoding/src/lib.rs +++ b/rust/lance-encoding/src/lib.rs @@ -86,6 +86,22 @@ pub trait EncodingsIo: std::fmt::Debug + Send + Sync { fn with_bypass_backpressure(&self) -> Option> { None } + + /// Returns a version of this I/O service that additionally records the I/O it + /// performs into `stats`, on top of any global accounting. This is the seam + /// used to measure exact per-scope (e.g. per-query) I/O without re-opening + /// files: wrap a reader's I/O service, perform the reads, then inspect the + /// recorder. + /// + /// Returns `None` if this implementation does not support per-scope I/O + /// statistics (e.g. in-memory or test schedulers), in which case the caller + /// should fall back to using self (and no statistics are recorded). + fn with_io_stats( + &self, + _stats: Arc, + ) -> Option> { + None + } } /// An implementation of EncodingsIo that serves data from an in-memory buffer diff --git a/rust/lance-file/src/io.rs b/rust/lance-file/src/io.rs index c09e9d8d372..1a8edf92b08 100644 --- a/rust/lance-file/src/io.rs +++ b/rust/lance-file/src/io.rs @@ -38,6 +38,16 @@ impl EncodingsIo for LanceEncodingsIo { })) } + fn with_io_stats( + &self, + stats: Arc, + ) -> Option> { + Some(Arc::new(Self { + scheduler: self.scheduler.with_io_stats(stats), + read_chunk_size: self.read_chunk_size, + })) + } + fn submit_request( &self, ranges: Vec>, diff --git a/rust/lance-file/src/reader.rs b/rust/lance-file/src/reader.rs index 9e4e4c449a4..c454f73819e 100644 --- a/rust/lance-file/src/reader.rs +++ b/rust/lance-file/src/reader.rs @@ -470,6 +470,23 @@ impl FileReader { } } + /// Returns a clone of this reader whose I/O is additionally recorded into + /// `stats`, on top of the scheduler's global accounting. 
+ /// + /// All cached metadata is shared with `self`, so no file is re-opened and + /// only a few `Arc` clones are performed. If the underlying I/O service + /// does not support per-scope statistics (e.g. an in-memory scheduler), the + /// returned reader is an ordinary, uninstrumented clone. + pub fn with_io_stats( + &self, + stats: Arc, + ) -> Self { + match self.scheduler.with_io_stats(stats) { + Some(scheduler) => self.with_scheduler(scheduler), + None => self.clone(), + } + } + pub fn num_rows(&self) -> u64 { self.num_rows } diff --git a/rust/lance-index/src/metrics.rs b/rust/lance-index/src/metrics.rs index 9e2161ae8f9..37e2c43d196 100644 --- a/rust/lance-index/src/metrics.rs +++ b/rust/lance-index/src/metrics.rs @@ -43,6 +43,19 @@ pub trait MetricsCollector: Send + Sync { /// /// The goal is to provide some visibility into the compute cost of the search fn record_comparisons(&self, num_comparisons: usize); + + /// Returns an optional sink for recording exact I/O statistics (bytes read, + /// IOPS, and requests) performed on behalf of this collector. + /// + /// Index implementations that read from a + /// [`lance_io::scheduler::ScanScheduler`] can attach the returned handle to + /// their file readers so the I/O performed for a single query is measured + /// and attributed here. The default returns `None`, meaning the caller does + /// not want I/O measured (and index implementations should then take their + /// normal, uninstrumented read path). + fn io_stats(&self) -> Option { + None + } } /// A no-op metrics collector that does nothing diff --git a/rust/lance-index/src/vector.rs b/rust/lance-index/src/vector.rs index d0df2fcb7e2..3c5a6601a8a 100644 --- a/rust/lance-index/src/vector.rs +++ b/rust/lance-index/src/vector.rs @@ -419,6 +419,14 @@ pub trait VectorIndex: Send + Sync + std::fmt::Debug + Index { /// the index type of this vector index. 
fn sub_index_type(&self) -> (SubIndexType, QuantizationType); + + /// The cumulative I/O performed while opening this index (file footers, IVF + /// centroids, quantization metadata). This is a one-time cost; it is + /// reported once, on the query that actually opens the index, and is `None` + /// for index implementations that do not track it. + fn open_io_stats(&self) -> Option { + None + } } // it can be an IVF index or a partition of IVF index diff --git a/rust/lance-index/src/vector/storage.rs b/rust/lance-index/src/vector/storage.rs index b036e187b77..8c091402687 100644 --- a/rust/lance-index/src/vector/storage.rs +++ b/rust/lance-index/src/vector/storage.rs @@ -14,10 +14,12 @@ use lance_core::{Error, ROW_ID, Result}; use lance_encoding::decoder::FilterExpression; use lance_file::reader::FileReader; use lance_io::ReadBatchParams; +use lance_io::scheduler::IoStats; use lance_linalg::distance::DistanceType; use prost::Message; use std::{ any::Any, + borrow::Cow, collections::BinaryHeap, mem::size_of, ops::{Deref, DerefMut}, @@ -620,15 +622,29 @@ impl IvfQuantizationStorage { self.ivf.num_partitions() } - pub async fn load_partition(&self, part_id: usize) -> Result { + /// Load a partition's quantization storage, optionally measuring the exact + /// I/O it performs into `io_stats`. + /// + /// When `io_stats` is `Some`, the partition is read through a reader whose + /// scheduler also records into the sink (a cheap clone that shares all + /// cached metadata, so no file is re-opened). When `None`, the normal + /// uninstrumented reader is used. 
+ pub async fn load_partition( + &self, + part_id: usize, + io_stats: Option, + ) -> Result { let range = self.ivf.row_range(part_id); let batch = if range.is_empty() { let schema = self.reader.schema(); let arrow_schema = arrow_schema::Schema::from(schema.as_ref()); RecordBatch::new_empty(Arc::new(arrow_schema)) } else { - let batches = self - .reader + let reader = match &io_stats { + Some(io_stats) => Cow::Owned(self.reader.with_io_stats(io_stats.recorder())), + None => Cow::Borrowed(&self.reader), + }; + let batches = reader .read_stream( ReadBatchParams::Range(range), u32::MAX, diff --git a/rust/lance-io/src/scheduler.rs b/rust/lance-io/src/scheduler.rs index 4f43cb00668..efe4b9b0c24 100644 --- a/rust/lance-io/src/scheduler.rs +++ b/rust/lance-io/src/scheduler.rs @@ -15,6 +15,7 @@ use std::sync::{Arc, Mutex}; use std::time::Instant; use tokio::sync::Notify; +use lance_core::utils::io_stats::IoStatsRecorder; use lance_core::utils::parse::str_is_truthy; use lance_core::{Error, Result}; @@ -475,8 +476,25 @@ impl StatsCollector { Ordering::Relaxed, ); } + + /// Add already-aggregated counts (e.g. a snapshot captured from another + /// scheduler) into these counters. + fn add(&self, iops: u64, requests: u64, bytes_read: u64) { + self.iops.fetch_add(iops, Ordering::Relaxed); + self.requests.fetch_add(requests, Ordering::Relaxed); + self.bytes_read.fetch_add(bytes_read, Ordering::Relaxed); + } } +impl IoStatsRecorder for StatsCollector { + fn record_request(&self, request: &[Range]) { + // Inherent methods take precedence in resolution, so this delegates to + // the inherent `record_request` above rather than recursing. + Self::record_request(self, request) + } +} + +#[derive(Debug, Clone, Copy, Default)] pub struct ScanStats { pub iops: u64, pub requests: u64, @@ -493,6 +511,57 @@ impl ScanStats { } } +/// A shareable, cloneable handle to a set of cumulative I/O counters. +/// +/// All clones share the same underlying counters. This serves two purposes: +/// +/// 1. 
It backs each [`ScanScheduler`]'s own running totals. +/// 2. It can be attached to an individual [`FileScheduler`] (via +/// [`FileScheduler::with_io_stats`]) as a *secondary* sink, so a caller can +/// measure the exact bytes/IOPS performed through that file handle for a +/// bounded scope (e.g. a single query) without disturbing the scheduler's +/// global totals. Read the result back with [`IoStats::snapshot`]. +#[derive(Debug, Clone)] +pub struct IoStats(Arc); + +impl IoStats { + pub fn new() -> Self { + Self(Arc::new(StatsCollector::new())) + } + + /// Record a single completed request. `request` holds the byte ranges as + /// actually submitted to storage (post coalescing/splitting), so the counts + /// reflect physical I/O. + pub fn record_request(&self, request: &[Range]) { + self.0.record_request(request); + } + + /// Take an immutable snapshot of the current cumulative counters. + pub fn snapshot(&self) -> ScanStats { + ScanStats::new(self.0.as_ref()) + } + + /// Return this handle as a type-erased [`IoStatsRecorder`], suitable for + /// attaching to a file reader (e.g. `FileReader::with_io_stats`). The + /// returned recorder shares the same underlying counters as `self`. + pub fn recorder(&self) -> Arc { + self.0.clone() + } + + /// Add a snapshot of already-aggregated statistics into this sink. Used to + /// fold in I/O measured on a separate scheduler (e.g. the one-time reads + /// performed while opening an index). 
+ pub fn add_scan_stats(&self, stats: &ScanStats) { + self.0.add(stats.iops, stats.requests, stats.bytes_read); + } +} + +impl Default for IoStats { + fn default() -> Self { + Self::new() + } +} + enum IoQueueType { Standard(Arc), Lite(Arc), @@ -509,7 +578,7 @@ enum IoQueueType { pub struct ScanScheduler { object_store: Arc, io_queue: IoQueueType, - stats: Arc, + stats: IoStats, } impl Debug for ScanScheduler { @@ -606,7 +675,7 @@ impl ScanScheduler { Arc::new(Self { object_store, io_queue, - stats: Arc::new(StatsCollector::new()), + stats: IoStats::new(), }) } @@ -646,6 +715,7 @@ impl ScanScheduler { base_priority, max_iop_size, bypass_backpressure: false, + extra_stats: None, }) } @@ -791,7 +861,7 @@ impl ScanScheduler { } pub fn stats(&self) -> ScanStats { - ScanStats::new(self.stats.as_ref()) + self.stats.snapshot() } #[cfg(test)] @@ -829,6 +899,10 @@ pub struct FileScheduler { base_priority: u64, max_iop_size: u64, bypass_backpressure: bool, + /// Optional secondary statistics sink. When set, every request submitted + /// through this handle is also recorded here, in addition to the + /// scheduler's global totals. Used to measure per-scope I/O. + extra_stats: Option>, } fn is_close_together(range1: &Range, range2: &Range, block_size: u64) -> bool { @@ -899,6 +973,9 @@ impl FileScheduler { } self.root.stats.record_request(&updated_requests); + if let Some(extra_stats) = &self.extra_stats { + extra_stats.record_request(&updated_requests); + } let bytes_vec_fut = self.root.submit_request( self.reader.clone(), @@ -964,6 +1041,23 @@ impl FileScheduler { max_iop_size: self.max_iop_size, base_priority: priority, bypass_backpressure: self.bypass_backpressure, + extra_stats: self.extra_stats.clone(), + } + } + + /// Returns a copy of this scheduler that additionally records the I/O it + /// performs into `stats`, on top of the scheduler's global statistics. + /// + /// This is the mechanism for measuring exact per-scope (e.g. 
per-query) I/O: + /// attach a recorder here (e.g. via [`IoStats::recorder`]), perform the reads + /// through the returned handle, then read the totals back with + /// [`IoStats::snapshot`]. The returned handle is cheap to create (a few + /// `Arc` clones) and reuses the same underlying reader, so it does not + /// re-open the file. + pub fn with_io_stats(&self, stats: Arc) -> Self { + Self { + extra_stats: Some(stats), + ..self.clone() } } @@ -1183,6 +1277,59 @@ mod tests { assert_eq!(11, scheduler.stats().iops); } + #[tokio::test] + async fn test_io_stats_sink() { + let tmp_file = TempObjFile::default(); + let obj_store = Arc::new(ObjectStore::local()); + + const DATA_SIZE: u64 = 1024 * 1024; + let mut some_data = vec![0; DATA_SIZE as usize]; + rand::rng().fill_bytes(&mut some_data); + obj_store.put(&tmp_file, &some_data).await.unwrap(); + + let scheduler = ScanScheduler::new(obj_store, SchedulerConfig::default_for_testing()); + + // Attach a per-scope sink to one file handle. + let sink = IoStats::new(); + let file_scheduler = scheduler + .open_file(&tmp_file, &CachedFileSize::unknown()) + .await + .unwrap() + .with_io_stats(sink.recorder()); + + // Three reads within 4KiB coalesce into a single physical IOP. The sink + // and the scheduler's global totals must agree exactly, because both are + // recorded from the same post-coalescing request. + file_scheduler + .submit_request(vec![50_000..51_000, 52_000..53_000, 54_000..55_000], 0) + .await + .unwrap(); + + let global = scheduler.stats(); + let scoped = sink.snapshot(); + assert_eq!(1, scoped.iops); + assert_eq!(1, scoped.requests); + // Coalesced range 50_000..55_000 => 5000 physical bytes. + assert_eq!(5000, scoped.bytes_read); + assert_eq!(global.iops, scoped.iops); + assert_eq!(global.requests, scoped.requests); + assert_eq!(global.bytes_read, scoped.bytes_read); + + // A sibling handle without the sink: the global totals advance but the + // sink stays put, proving per-scope isolation. 
+ let other = scheduler + .open_file(&tmp_file, &CachedFileSize::unknown()) + .await + .unwrap(); + other.submit_request(vec![0..1000], 0).await.unwrap(); + + let global_after = scheduler.stats(); + let scoped_after = sink.snapshot(); + assert_eq!(global.bytes_read + 1000, global_after.bytes_read); + assert_eq!(scoped.bytes_read, scoped_after.bytes_read); + assert_eq!(scoped.iops, scoped_after.iops); + } + #[tokio::test] async fn test_priority() { let some_path = Path::parse("foo").unwrap(); diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 8984d507408..78e5c429527 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -2158,6 +2158,15 @@ impl DatasetIndexInternalExt for Dataset { }; let (index, ivf_entry) = result?; metrics.record_index_load(); + // Attribute the one-time index-open I/O (file footers, IVF centroids, + // quantization metadata) to this query's metrics. This runs only on a + // real open; cache hits return earlier, so a warm query reports zero + // index-open I/O. 
+ if let Some(io_stats) = metrics.io_stats() + && let Some(open_stats) = index.open_io_stats() + { + io_stats.add_scan_stats(&open_stats); + } if let Some(ivf_entry) = ivf_entry { let state_key = IvfIndexStateCacheKey::new(uuid, frag_reuse_uuid.as_ref()); self.index_cache diff --git a/rust/lance/src/index/vector/builder.rs b/rust/lance/src/index/vector/builder.rs index 579449cc087..1e4fec8c762 100644 --- a/rust/lance/src/index/vector/builder.rs +++ b/rust/lance/src/index/vector/builder.rs @@ -1045,7 +1045,7 @@ impl IvfIndexBuilder continue; } - let part_storage = existing_index.load_partition_storage(part_id).await?; + let part_storage = existing_index.load_partition_storage(part_id, None).await?; let mut part_batches = part_storage.to_batches()?.collect::>(); // for PQ, the PQ codes are transposed, so we need to transpose them back match Q::quantization_type() { diff --git a/rust/lance/src/index/vector/ivf.rs b/rust/lance/src/index/vector/ivf.rs index 579990fc03b..fb01339ead9 100644 --- a/rust/lance/src/index/vector/ivf.rs +++ b/rust/lance/src/index/vector/ivf.rs @@ -6266,7 +6266,7 @@ mod tests { ); // PQ code is on residual space - let pq_store = ivf_idx.load_partition_storage(0).await.unwrap(); + let pq_store = ivf_idx.load_partition_storage(0, None).await.unwrap(); pq_store .codebook() .values() diff --git a/rust/lance/src/index/vector/ivf/v2.rs b/rust/lance/src/index/vector/ivf/v2.rs index 4ea076ed420..29d9e224970 100644 --- a/rust/lance/src/index/vector/ivf/v2.rs +++ b/rust/lance/src/index/vector/ivf/v2.rs @@ -7,6 +7,7 @@ use std::io::Write as IoWrite; use std::marker::PhantomData; use std::{ any::Any, + borrow::Cow, collections::{BinaryHeap, HashMap}, sync::{Arc, Mutex}, }; @@ -64,7 +65,7 @@ use lance_index::{ }; use lance_index::{INDEX_METADATA_SCHEMA_KEY, IndexMetadata}; use lance_io::local::to_local_path; -use lance_io::scheduler::SchedulerConfig; +use lance_io::scheduler::{IoStats, ScanStats, SchedulerConfig}; use lance_io::utils::CachedFileSize; use 
lance_io::{ ReadBatchParams, object_store::ObjectStore, scheduler::ScanScheduler, traits::Reader, @@ -614,6 +615,11 @@ pub struct IVFIndex { index_cache: WeakLanceCache, io_parallelism: usize, + /// Cumulative I/O performed while opening this index (file footers, IVF + /// centroids, quantization metadata). Captured once in `try_new`; exposed + /// via [`VectorIndex::open_io_stats`] so the opening query can attribute the + /// one-time open cost to its plan metrics. + open_io_stats: ScanStats, scratch_pool: Arc, use_query_residual: bool, use_residual_scratch: bool, @@ -1090,6 +1096,12 @@ impl IVFIndex { let use_residual_scratch = Self::use_residual_scratch(&ivf, use_query_residual); let rq_search_cache = Self::build_rq_search_cache(&ivf, &storage)?; + // The scheduler is freshly created above and, at this point, has served + // only the open-time reads (file footers, IVF centroids, quantization + // metadata) -- partition reads happen later, during queries. So its + // cumulative stats are exactly the one-time index-open I/O. + let open_io_stats = scheduler.stats(); + Ok(Self { uri: to_local_path(&uri), index_path: uri.as_ref().to_string(), @@ -1105,6 +1117,7 @@ impl IVFIndex { distance_type, index_cache: WeakLanceCache::from(&index_cache), io_parallelism, + open_io_stats, _marker: PhantomData, }) } @@ -1142,6 +1155,10 @@ impl IVFIndex { distance_type, index_cache: WeakLanceCache::from(&index_cache), io_parallelism, + // Reconstruction from cached state re-opens readers on its own path; + // the open-time I/O is not attributed here (it is a one-time cost, + // and the first open via `try_new` already accounts for it). 
+ open_io_stats: ScanStats::default(), _marker: PhantomData, } } @@ -1169,7 +1186,8 @@ impl IVFIndex { .get_or_insert_with_key(cache_key, || async { info!(target: TRACE_IO_EVENTS, r#type=IO_TYPE_LOAD_VECTOR_PART, index_type="ivf", part_id=partition_id); metrics.record_part_load(); - self.load_partition_entry(partition_id).await + self.load_partition_entry(partition_id, metrics.io_stats()) + .await }) .await?; Ok(entry as Arc) @@ -1179,11 +1197,18 @@ impl IVFIndex { } info!(target: TRACE_IO_EVENTS, r#type=IO_TYPE_LOAD_VECTOR_PART, index_type="ivf", part_id=partition_id); metrics.record_part_load(); - Ok(Arc::new(self.load_partition_entry(partition_id).await?)) + Ok(Arc::new( + self.load_partition_entry(partition_id, metrics.io_stats()) + .await?, + )) } } - async fn load_partition_entry(&self, partition_id: usize) -> Result> { + async fn load_partition_entry( + &self, + partition_id: usize, + io_stats: Option, + ) -> Result> { let schema = Arc::new(self.reader.schema().as_ref().into()); let batch = match self.reader.metadata().num_rows { 0 => RecordBatch::new_empty(schema), @@ -1192,8 +1217,17 @@ impl IVFIndex { if row_range.is_empty() { RecordBatch::new_empty(schema) } else { - let batches = self - .reader + // When I/O is being measured, read through a reader whose + // scheduler also records into the per-query sink (a cheap + // clone sharing all cached metadata, no file re-open). + // Otherwise borrow the shared reader as-is, with no clone. 
+ let reader = match &io_stats { + Some(io_stats) => { + Cow::Owned(self.reader.with_io_stats(io_stats.recorder())) + } + None => Cow::Borrowed(&self.reader), + }; + let batches = reader .read_stream( ReadBatchParams::Range(row_range), u32::MAX, @@ -1212,15 +1246,19 @@ impl IVFIndex { self.sub_index_metadata[partition_id].clone(), )?; let idx = S::load(batch)?; - let storage = self.load_partition_storage(partition_id).await?; + let storage = self.load_partition_storage(partition_id, io_stats).await?; Ok(PartitionEntry { index: idx, storage, }) } - pub async fn load_partition_storage(&self, partition_id: usize) -> Result { - self.storage.load_partition(partition_id).await + pub async fn load_partition_storage( + &self, + partition_id: usize, + io_stats: Option, + ) -> Result { + self.storage.load_partition(partition_id, io_stats).await } /// preprocess the query vector given the partition id. @@ -1800,6 +1838,10 @@ impl VectorIndex for IVFInd fn metric_type(&self) -> DistanceType { self.distance_type } + + fn open_io_stats(&self) -> Option { + Some(self.open_io_stats) + } } pub type IvfFlatIndex = IVFIndex; @@ -2723,7 +2765,7 @@ mod tests { async fn load_partition_row_ids(index: &IvfPq, partition_idx: usize) -> Vec { index .storage - .load_partition(partition_idx) + .load_partition(partition_idx, None) .await .unwrap() .row_ids() diff --git a/rust/lance/src/io/exec/knn.rs b/rust/lance/src/io/exec/knn.rs index c4c79dcee5e..0ceddf7c5ee 100644 --- a/rust/lance/src/io/exec/knn.rs +++ b/rust/lance/src/io/exec/knn.rs @@ -926,6 +926,9 @@ impl ExecutionPlan for ANNIvfPartitionExec { }) .buffered(self.index_uuids.len().min(target_partitions).max(1)) .finally(move || { + // Partition ranking reads centroids from memory, so this is + // typically zero; flushed for symmetry with ANNSubIndex. 
+ metrics_clone.index_metrics.flush_io(); metrics_clone.baseline_metrics.done(); metrics_clone .baseline_metrics @@ -1627,6 +1630,9 @@ impl ExecutionPlan for ANNIvfSubIndexExec { // will not start until the early search is complete across all deltas. .try_flatten_unordered(None) .finally(move || { + // Publish the exact index-file I/O measured for this query + // (cache misses only) to the iops/requests/bytes_read gauges. + metrics_clone.index_metrics.flush_io(); metrics_clone .baseline_metrics .elapsed_compute() @@ -2919,6 +2925,128 @@ mod tests { assert_find_partitions_elapsed_recorded(&stats); } + /// The ANN operators report the exact index-file I/O performed for a query + /// (bytes_read / iops), measured only on cache misses. A cold search loads + /// partitions from storage and reports non-zero I/O; an immediately + /// following warm search serves every partition from the index cache and + /// reports zero -- which is the cache-effectiveness signal the metric adds. + #[tokio::test] + async fn test_io_metrics_cold_vs_warm() { + let fixture = NprobesTestFixture::new(100, 1).await; + let q = fixture.get_centroid(0); + + let run = |holder: &StatsHolder| { + let setter = holder.get_setter(); + async { + fixture + .dataset + .scan() + .nearest("vector", q.as_ref(), 10) + .unwrap() + .minimum_nprobes(10) + .scan_stats_callback(setter) + .project(&Vec::::new()) + .unwrap() + .with_row_id() + .try_into_batch() + .await + .unwrap() + } + }; + + // Cold: a freshly opened dataset has an empty index cache, so the + // sub-index search must read partitions (and their quantization storage) + // from disk. Those reads flow through the per-query I/O sink. 
+ let cold_holder = StatsHolder::default(); + run(&cold_holder).await; + let cold = cold_holder.consume(); + assert!( + cold.parts_loaded > 0, + "cold search should load partitions, got parts_loaded={}", + cold.parts_loaded + ); + assert!( + cold.bytes_read > 0, + "cold search should report index-file I/O, got bytes_read={}", + cold.bytes_read + ); + assert!( + cold.iops > 0, + "cold search should report index-file IOPS, got iops={}", + cold.iops + ); + + // Warm: the same query on the same dataset finds every partition it + // needs already cached, so no index-file I/O is performed. + let warm_holder = StatsHolder::default(); + run(&warm_holder).await; + let warm = warm_holder.consume(); + assert_eq!( + warm.parts_loaded, 0, + "warm search should not reload partitions, got parts_loaded={}", + warm.parts_loaded + ); + assert_eq!( + warm.bytes_read, 0, + "warm search should report no index-file I/O, got bytes_read={}", + warm.bytes_read + ); + } + + /// The new I/O metrics must actually surface in `EXPLAIN ANALYZE` text on + /// the ANN operators: non-zero on a cold query (partition reads on + /// `ANNSubIndex`, index-open reads on `ANNIvfPartition`) and zero on a warm + /// query (everything served from the index cache). + #[tokio::test] + async fn test_io_metrics_visible_in_explain_analyze() { + // Returns the value of `metric=` from the analyzed-plan line for `node`. + fn node_metric<'a>(plan: &'a str, node: &str, metric: &str) -> &'a str { + let line = plan + .lines() + .find(|l| l.trim_start().starts_with(node)) + .unwrap_or_else(|| panic!("plan missing node {node}:\n{plan}")); + let after = line + .split_once(&format!("{metric}=")) + .unwrap_or_else(|| panic!("node {node} line missing {metric}=:\n{line}")) + .1; + after.split([',', ']']).next().unwrap().trim() + } + + let fixture = NprobesTestFixture::new(100, 1).await; + let q = fixture.get_centroid(0); + + // Cold: a freshly opened dataset must show real index-file I/O. 
+ let cold = fixture + .dataset + .scan() + .nearest("vector", q.as_ref(), 10) + .unwrap() + .minimum_nprobes(10) + .analyze_plan() + .await + .unwrap(); + // Sub-index partition reads. + assert_ne!(node_metric(&cold, "ANNSubIndex", "bytes_read"), "0"); + assert_ne!(node_metric(&cold, "ANNSubIndex", "iops"), "0"); + // Index-open reads (centroids/metadata) now attributed to the partition + // operator -- the value this part of the change adds. + assert_ne!(node_metric(&cold, "ANNIvfPartition", "bytes_read"), "0"); + assert_ne!(node_metric(&cold, "ANNIvfPartition", "iops"), "0"); + + // Warm: same query, everything cache-resident -> zero index-file I/O. + let warm = fixture + .dataset + .scan() + .nearest("vector", q.as_ref(), 10) + .unwrap() + .minimum_nprobes(10) + .analyze_plan() + .await + .unwrap(); + assert_eq!(node_metric(&warm, "ANNSubIndex", "bytes_read"), "0"); + assert_eq!(node_metric(&warm, "ANNIvfPartition", "bytes_read"), "0"); + } + #[rstest] #[tokio::test] async fn test_no_prefilter_results(#[values(1, 20)] num_deltas: usize) { diff --git a/rust/lance/src/io/exec/utils.rs b/rust/lance/src/io/exec/utils.rs index af3c5095f75..6e2d50d3736 100644 --- a/rust/lance/src/io/exec/utils.rs +++ b/rust/lance/src/io/exec/utils.rs @@ -6,7 +6,7 @@ use lance_datafusion::utils::{ IOPS_METRIC, PARTS_LOADED_METRIC, REQUESTS_METRIC, }; use lance_index::metrics::MetricsCollector; -use lance_io::scheduler::ScanScheduler; +use lance_io::scheduler::{IoStats, ScanScheduler, ScanStats}; use lance_table::format::IndexMetadata; use pin_project::pin_project; use std::future::Future; @@ -502,12 +502,17 @@ impl IoMetrics { } pub fn record(&self, scan_scheduler: &ScanScheduler) { - let current_stats = scan_scheduler.stats(); + self.record_stats(scan_scheduler.stats()); + } - // Use set_max to ensure gauge always shows the highest value seen - self.iops.set_max(current_stats.iops as usize); - self.requests.set_max(current_stats.requests as usize); - 
self.bytes_read.set_max(current_stats.bytes_read as usize); + /// Record a snapshot of cumulative I/O statistics. + /// + /// Uses `set_max` because the underlying counters are cumulative; the gauge + /// always reflects the highest (i.e. final) value seen. + pub fn record_stats(&self, stats: ScanStats) { + self.iops.set_max(stats.iops as usize); + self.requests.set_max(stats.requests as usize); + self.bytes_read.set_max(stats.bytes_read as usize); } } @@ -516,6 +521,12 @@ pub struct IndexMetrics { indices_loaded: Count, parts_loaded: Count, index_comparisons: Count, + /// Per-query sink that accumulates exact index-file I/O as partitions are + /// loaded from storage. Shared by all clones of this `IndexMetrics`, so + /// concurrent partition loads all funnel into the same counters. Published + /// to `io_metrics` for display via [`IndexMetrics::flush_io`]. + io_stats: IoStats, + io_metrics: IoMetrics, } impl IndexMetrics { @@ -524,8 +535,18 @@ impl IndexMetrics { indices_loaded: metrics.new_count(INDICES_LOADED_METRIC, partition), parts_loaded: metrics.new_count(PARTS_LOADED_METRIC, partition), index_comparisons: metrics.new_count(INDEX_COMPARISONS_METRIC, partition), + io_stats: IoStats::new(), + io_metrics: IoMetrics::new(metrics, partition), } } + + /// Publish the I/O accumulated in the per-query sink to the displayed + /// `iops`/`requests`/`bytes_read` metrics. Call once when the operator's + /// stream finishes; the sink only accumulates on cache misses, so a fully + /// cache-resident query publishes zeros. 
+ pub fn flush_io(&self) { + self.io_metrics.record_stats(self.io_stats.snapshot()); + } } impl MetricsCollector for IndexMetrics { @@ -538,6 +559,9 @@ impl MetricsCollector for IndexMetrics { fn record_comparisons(&self, num_comparisons: usize) { self.index_comparisons.add(num_comparisons); } + fn io_stats(&self) -> Option { + Some(self.io_stats.clone()) + } } #[cfg(test)] From e87bae10b85534aaf399abffd7c00bad99599304 Mon Sep 17 00:00:00 2001 From: ForwardXu Date: Thu, 11 Jun 2026 18:09:50 +0800 Subject: [PATCH 085/177] docs(object_store): add Tencent Cloud COS and GooseFS configuration (#7151) --- docs/src/guide/object_store.md | 205 +++++++++++++++++++++++++++++++++ 1 file changed, 205 insertions(+) diff --git a/docs/src/guide/object_store.md b/docs/src/guide/object_store.md index 182b93c0574..f901d2c2411 100644 --- a/docs/src/guide/object_store.md +++ b/docs/src/guide/object_store.md @@ -248,3 +248,208 @@ ds = lance.dataset( | `tos_access_key_id` | Access key ID used for TOS authentication. Optional if credentials are provided by environment. | | `tos_secret_access_key` | Secret access key used for TOS authentication. Optional if credentials are provided by environment. | | `tos_security_token` | Security token for temporary credentials. Optional. | + +## Tencent Cloud COS Configuration + +[COS (Cloud Object Storage)](https://cloud.tencent.com/product/cos) credentials can be set in environment variables prefixed +with `COS_` or `TENCENTCLOUD_` (for example, `COS_ENDPOINT`, `COS_SECRET_ID`, +`COS_SECRET_KEY`, `TENCENTCLOUD_REGION`, `TENCENTCLOUD_SECURITY_TOKEN`). 
+Alternatively, credentials can be passed via the `storage_options` +parameter; explicit `storage_options` values override environment variables: + +=== "Python" + + ```python + import lance + ds = lance.dataset( + "cos://bucket/path", + storage_options={ + "cos_endpoint": "https://cos.ap-guangzhou.myqcloud.com", + "cos_secret_id": "my-secret-id", + "cos_secret_key": "my-secret-key", + } + ) + ``` + +=== "Rust" + + In this Lance distribution, `tencent` is already part of the **default + features** of the `lance` crate, so simply depending on `lance` is enough: + + ```toml + [dependencies] + lance = "*" + ``` + + You only need to enable the `tencent` feature explicitly in the following + cases: + + - You opted out of default features, e.g. + `lance = { version = "*", default-features = false, features = ["tencent", ...] }`. + - You depend on `lance-io` directly (without `lance`); `tencent` is **not** + a default feature of `lance-io`: + `lance-io = { version = "*", features = ["tencent"] }`. + +| Key | Description | +|-----|-------------| +| `cos_endpoint` | COS endpoint. Required (for example, `https://cos.ap-guangzhou.myqcloud.com`). Can also be set via the `COS_ENDPOINT` environment variable. | +| `cos_secret_id` | Secret ID used for COS authentication. Optional if credentials are provided by environment. | +| `cos_secret_key` | Secret key used for COS authentication. Optional if credentials are provided by environment. | +| `cos_enable_versioning` | Whether to enable object versioning on the bucket. Optional. | + +!!! note + + The OpenDAL `CosConfig` currently exposes a limited set of options. Additional + settings such as the security token (`TENCENTCLOUD_SECURITY_TOKEN`) and region + (`TENCENTCLOUD_REGION`) must be configured via environment variables. + +## GooseFS Configuration + +[GooseFS](https://cloud.tencent.com/product/goosefs) is a distributed caching +filesystem. Lance accesses GooseFS through its Master gRPC service. 
The URL format +is `goosefs://host:port/path`, where `host:port` is the GooseFS Master address +(default port: `9200`, may be omitted, e.g. `goosefs://10.0.0.1/path`) and +`/path` is the filesystem path within GooseFS. + +!!! note "About the dataset path" + + `/path` is just an arbitrary directory inside GooseFS — Lance does **not** + require the path to end with a `.lance` suffix. Any valid GooseFS directory + works, for example: + + - `goosefs://10.0.0.1:9200/data/my-dataset` + - `goosefs://10.0.0.1:9200/data/my-dataset.lance` + - `goosefs://10.0.0.1:9200/lance-test/lance-io` + + The `.lance` suffix used in the examples below is only a naming convention + that makes it easy to recognize a Lance dataset directory at a glance; it + has no special meaning to Lance itself. The only requirement is that the + same path is used consistently for reads and writes of a given dataset. + +=== "Python" + + ```python + import lance + + ds = lance.dataset( + "goosefs://10.0.0.1:9200/data/my-dataset.lance", + storage_options={ + "goosefs_auth_type": "simple", + "goosefs_auth_username": "lance", + }, + ) + ``` + +=== "Rust" + + In this Lance distribution, `goosefs` is already part of the **default + features** of the `lance` crate, so simply depending on `lance` is enough: + + ```toml + [dependencies] + lance = "*" + ``` + + You only need to enable the `goosefs` feature explicitly in the following + cases: + + - You opted out of default features, e.g. + `lance = { version = "*", default-features = false, features = ["goosefs", ...] }`. + - You depend on `lance-io` directly (without `lance`); `goosefs` is **not** + a default feature of `lance-io`: + `lance-io = { version = "*", features = ["goosefs"] }`. 
+ + Open the underlying `lance_io::object_store::ObjectStore` directly (mirrors + the integration test in `rust/lance-io/tests/goosefs_integration.rs`): + + ```rust + use lance_io::object_store::ObjectStore; + + let uri = "goosefs://10.0.0.1:9200/lance-test/lance-io"; + let (store, path) = ObjectStore::from_uri(uri).await?; + + // Read / write through the underlying `object_store::ObjectStore` API + store.inner.put(&path, (&b"hello"[..]).into()).await?; + let result = store.inner.get(&path).await?; + let bytes = result.bytes().await?; + ``` + + Open a Lance dataset with custom storage options: + + ```rust + use std::collections::HashMap; + use lance::dataset::DatasetBuilder; + + let mut storage_options = HashMap::new(); + storage_options.insert("goosefs_master_addr".to_string(), "10.0.0.1:9200".to_string()); + storage_options.insert("goosefs_auth_type".to_string(), "simple".to_string()); + storage_options.insert("goosefs_auth_username".to_string(), "lance".to_string()); + + let dataset = DatasetBuilder::from_uri("goosefs://10.0.0.1:9200/data/my-dataset.lance") + .with_storage_options(storage_options) + .load() + .await?; + ``` + +=== "Java" + + Pass the GooseFS configuration through `ReadOptions.setStorageOptions` + when opening the dataset: + + ```java + import org.lance.Dataset; + import org.lance.ReadOptions; + + import java.util.HashMap; + import java.util.Map; + + Map storageOptions = new HashMap<>(); + storageOptions.put("goosefs_master_addr", "10.0.0.1:9200"); + storageOptions.put("goosefs_auth_type", "simple"); + storageOptions.put("goosefs_auth_username", "lance"); + + ReadOptions options = new ReadOptions.Builder() + .setStorageOptions(storageOptions) + .build(); + + try (Dataset dataset = Dataset.open() + .uri("goosefs://10.0.0.1:9200/data/my-dataset.lance") + .readOptions(options) + .build()) { + // ... use the dataset + } + ``` + + For writes, the same `storageOptions(...)` setter is available on + `WriteDatasetBuilder` and `WriteFragmentBuilder`. 
+ +The Master address can be resolved from (in priority order): + +1. The `goosefs_master_addr` storage option (supports HA: `"addr1:port,addr2:port"`). +2. The `GOOSEFS_MASTER_ADDR` environment variable. +3. The host and port from the URL authority. + +The following keys can be used either as environment variables or as keys in the +`storage_options` parameter: + +| Key | Description | +|-----|-------------| +| `goosefs_master_addr` / `GOOSEFS_MASTER_ADDR` | GooseFS Master address. Supports a single address (`host:port`) or comma-separated HA addresses (`addr1:port,addr2:port`). Optional if the address is provided in the URL. | +| `goosefs_write_type` / `GOOSEFS_WRITE_TYPE` | Write type, e.g. `MUST_CACHE`, `CACHE_THROUGH`, `THROUGH`, `ASYNC_THROUGH`. Optional. | +| `goosefs_block_size` / `GOOSEFS_BLOCK_SIZE` | GooseFS block size in bytes (this is the GooseFS-side block size, not Lance's I/O block size). Optional. | +| `goosefs_chunk_size` / `GOOSEFS_CHUNK_SIZE` | Chunk size in bytes used when reading or writing files. Optional. | +| `goosefs_auth_type` / `GOOSEFS_AUTH_TYPE` | Authentication type. Either `nosasl` or `simple` (case-insensitive; the value is passed through to OpenDAL). Optional. | +| `goosefs_auth_username` / `GOOSEFS_AUTH_USERNAME` | Username used in `simple` authentication mode. Optional. | + +!!! note "Running the GooseFS integration tests" + + The Rust integration tests for GooseFS live at + `rust/lance-io/tests/goosefs_integration.rs` and are gated behind feature + flags. 
They require a reachable GooseFS cluster (configured via the + `GOOSEFS_MASTER_ADDR` and `GOOSEFS_AUTH_TYPE` environment variables) and + can be run with: + + ```bash + cargo test -p lance-io --features "goosefs goosefs-test" \ + --test goosefs_integration -- --ignored --nocapture --test-threads=1 + ``` From bb9b4638ec1c76763a03ae290e367851a956eeb3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 11 Jun 2026 08:01:45 -0700 Subject: [PATCH 086/177] chore(deps): bump lance-namespace from 0.8.0 to 0.8.4 in /python (#7218) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit > [!WARNING] > Dependabot will stop supporting `python v3.9`! > > Please upgrade to one of the following versions: `v3.9`, `v3.10`, `v3.11`, `v3.12`, `v3.13`, or `v3.14`. > Bumps [lance-namespace](https://github.com/lance-format/lance-namespace) from 0.8.0 to 0.8.4.
Release notes

Sourced from lance-namespace's releases.

v0.8.4

What's Changed

New Features 🎉

Full Changelog: https://github.com/lance-format/lance-namespace/compare/v0.8.2...v0.8.4

v0.8.2

What's Changed

New Features 🎉

Bug Fixes 🐛

New Contributors

Full Changelog: https://github.com/lance-format/lance-namespace/compare/v0.8.1...v0.8.2

Commits
  • 4f61cc7 chore: release version 0.8.4
  • abdfb60 chore: release version 0.8.3-beta.1
  • 378bc3c feat: define nested field path contract (#351)
  • d626153 feat: enrich IndexContent with describe_indices metadata (#349)
  • b3e5efd chore: release version 0.8.2
  • fa0fe85 feat: add table branch operations and branch parameter (#350)
  • a8c1ad3 fix: restore CreateEmptyTable deprecated shims for backwards compatibility (#...
  • dc609bd chore: release version 0.8.1
  • 5d3305a feat: add update_field_metadata operation (#347)
  • See full diff in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=lance-namespace&package-manager=uv&previous-version=0.8.0&new-version=0.8.4)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- python/uv.lock | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/uv.lock b/python/uv.lock index 314417f5aa1..428578ab26f 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -1083,19 +1083,19 @@ wheels = [ [[package]] name = "lance-namespace" -version = "0.8.0" +version = "0.8.4" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "lance-namespace-urllib3-client" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/21/80/2b6eaa08c5e25915acaa6368a70211a25b5ba9d2d6006450e68a73936164/lance_namespace-0.8.0.tar.gz", hash = "sha256:c4a79ee221a3b2315c29863ad12d85fcf219a13158e26149d63e21dc4b4673a7", size = 10756, upload-time = "2026-06-01T08:47:10.183Z" } +sdist = { url = "https://files.pythonhosted.org/packages/48/8f/8a03395587a78cfaf92f7307ad931f61eb515af67705c704bd6c7af2f745/lance_namespace-0.8.4.tar.gz", hash = "sha256:1a54ad49e7ace25a629c5f2c99d393629742eceeeb16ba2f51a771ccb350e284", size = 11282, upload-time = "2026-06-10T19:07:21.919Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/4b/bd/7b40a08fb132fab39a6caebf832fdf6b9befc71be9413beb9be0a9d927d4/lance_namespace-0.8.0-py3-none-any.whl", hash = "sha256:782cf9e332f46bf06836722dd98b53ca8495ad98bb541501ff6876c89b67ec90", size = 12579, upload-time = "2026-06-01T08:47:10.91Z" }, + { url = "https://files.pythonhosted.org/packages/fd/4b/218c67cafb707024069925ce86534588861a464aaa327f7a457b94eed3c2/lance_namespace-0.8.4-py3-none-any.whl", hash = "sha256:8b347eef4b7c7187a1b52f388b5dcc345fed0bf4ea87728188dcb11a52619d0b", size = 13111, upload-time = "2026-06-10T19:07:22.6Z" }, ] [[package]] name = "lance-namespace-urllib3-client" -version = "0.8.0" +version = "0.8.4" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "pydantic" }, @@ -1104,9 +1104,9 @@ dependencies = [ { name = "urllib3", version = 
"1.26.20", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "urllib3", version = "2.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/8c/37/06fcd5a8969381e0ba953d51990af8d331bdccbc62458bf2eed30d064573/lance_namespace_urllib3_client-0.8.0.tar.gz", hash = "sha256:4f060f05ebf3c04aeaeb0d2022cbe77648a3df290f02cd2c305e5797d0fc1fdd", size = 203710, upload-time = "2026-06-01T08:47:13.404Z" } +sdist = { url = "https://files.pythonhosted.org/packages/0a/55/4a7cc7e5d19bda170c896a6adff2ec925c533df812b91bce2bc8f7aea30b/lance_namespace_urllib3_client-0.8.4.tar.gz", hash = "sha256:1a292a83509ab79475da967b78839e9ead4ab973064d37d1ba1575b23ffdacef", size = 228485, upload-time = "2026-06-10T19:07:19.863Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/51/43/e280727feee958f303bc58d5fa912b07734a0831f756d841654d500c2c34/lance_namespace_urllib3_client-0.8.0-py3-none-any.whl", hash = "sha256:6734e341b726e5cc96a0cd257cef27eb9d03013f2d151526ee426cef8e63e228", size = 336669, upload-time = "2026-06-01T08:47:11.88Z" }, + { url = "https://files.pythonhosted.org/packages/b4/f7/70dd2fc1f9ef462d3802b4cffcd64f2b9233a9907d6071e8694338492608/lance_namespace_urllib3_client-0.8.4-py3-none-any.whl", hash = "sha256:37ee1d74614fae6358f50e3589ac26c29379ffb1346f09c4f5ec8953f823cefd", size = 369807, upload-time = "2026-06-10T19:07:21.001Z" }, ] [[package]] From 717346813d1fcd7808c6bf678e977bc621687b02 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Thu, 11 Jun 2026 10:26:00 -0700 Subject: [PATCH 087/177] perf(dir-catalog): rewrite manifest mutations with copy-on-write (#7176) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - Replace `__manifest` merge-insert/delete maintenance with always copy-on-write rewrites: each mutation streams the latest `__manifest` into a single 
replacement data file and commits a new manifest version with freshly built replacement scalar indices (`object_id` BTree, `object_type` Bitmap, `base_objects` LabelList), so reads stay index-accelerated. - Commit the rewrite by writing the manifest **directly** via the storage commit handler — atomic put-if-not-exists at version N+1, version hint, and the overwrite transaction embedded **inline** (no `_transactions/*.txn` file). No `CommitBuilder`/transaction-rebase machinery, and **no new lance-core API surface** (kept entirely in `lance-namespace-impls`). - **Native conflict handling**: on a storage commit error, read back the latest version and check whether the intent is already satisfied (create → object now exists ⇒ fail; delete → object gone ⇒ succeed) before retrying. Staged data/index files are deleted only once the commit is proven not to have landed, so a lost-ack commit can no longer orphan files a committed manifest references (fixes a silent corruption window). - Delete table-version ranges by streaming the rewrite, without expanding every version id. - Adds a commit benchmark: `examples/manifest_bench.rs` + `benches/manifest_commit_sweep.sh` (see `BENCHMARK.md`). This is the production implementation of the copy-on-write approach prototyped in #6794. ## Measured performance `c7i.48xlarge`, S3 `us-east-1`, op `write-create-namespace` (pure `__manifest` commit). The catalog is single-writer-throughput-bound: per-commit cost scales ~O(rows), and throughput does **not** increase with concurrency (every commit is a serialized manifest version bump). Continuous (1 process, 100 commits), ops/s — inline index vs no index: | rows | inline index | no index | |---:|---:|---:| | 1,000 | 2.0 | 3.5 | | 100,000 | 1.1 | 2.1 | | 1,000,000 | 0.34 | 0.53 | Concurrent steady TPS is flat across 10–200 processes (e.g. inline @100k ≈ 1.4–1.5 ops/s at every concurrency level; @1M ≈ 0.3 ops/s). 
Conflicts beyond the retry budget surface as errors that grow with concurrency — the contention ceiling, not data loss (≈0 errors at ≤20 processes). No-index commits run ~1.5–2× faster (no per-commit index build). This is on par with the #6794 prototype's ~2.3–2.5 ops/s single-process S3 write throughput. --- Cargo.lock | 4 + python/Cargo.lock | 4 + rust/lance-namespace-datafusion/tests/sql.rs | 2 + rust/lance-namespace-impls/BENCHMARK.md | 73 + rust/lance-namespace-impls/Cargo.toml | 9 + .../benches/manifest_commit_sweep.sh | 146 + .../examples/manifest_bench.rs | 714 ++++ rust/lance-namespace-impls/src/dir.rs | 64 +- .../lance-namespace-impls/src/dir/manifest.rs | 2976 +++++++++++++---- 9 files changed, 3400 insertions(+), 592 deletions(-) create mode 100644 rust/lance-namespace-impls/BENCHMARK.md create mode 100755 rust/lance-namespace-impls/benches/manifest_commit_sweep.sh create mode 100644 rust/lance-namespace-impls/examples/manifest_bench.rs diff --git a/Cargo.lock b/Cargo.lock index 866eb9b4b0e..75b7f902d7a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5028,6 +5028,8 @@ dependencies = [ "base64 0.22.1", "bytes", "chrono", + "datafusion-common", + "datafusion-physical-plan", "futures", "hmac 0.12.1", "lance", @@ -5045,6 +5047,7 @@ dependencies = [ "rand 0.9.4", "reqwest 0.12.28", "ring", + "roaring", "rstest", "rustls-pki-types", "serde", @@ -5055,6 +5058,7 @@ dependencies = [ "tower", "tower-http 0.5.2", "url", + "uuid", "wiremock", ] diff --git a/python/Cargo.lock b/python/Cargo.lock index 879195811cf..5a6fb26be91 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -4560,6 +4560,8 @@ dependencies = [ "async-trait", "axum", "bytes", + "datafusion-common", + "datafusion-physical-plan", "futures", "lance", "lance-core", @@ -4572,12 +4574,14 @@ dependencies = [ "object_store", "rand 0.9.4", "reqwest 0.12.28", + "roaring", "serde", "serde_json", "tokio", "tower", "tower-http 0.5.2", "url", + "uuid", ] [[package]] diff --git 
a/rust/lance-namespace-datafusion/tests/sql.rs b/rust/lance-namespace-datafusion/tests/sql.rs index e49cd7e58e3..5332e831cb6 100755 --- a/rust/lance-namespace-datafusion/tests/sql.rs +++ b/rust/lance-namespace-datafusion/tests/sql.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors +#![recursion_limit = "256"] + use std::sync::Arc; use arrow_array::{Int32Array, Int64Array, RecordBatch, RecordBatchIterator, StringArray}; diff --git a/rust/lance-namespace-impls/BENCHMARK.md b/rust/lance-namespace-impls/BENCHMARK.md new file mode 100644 index 00000000000..074ec303347 --- /dev/null +++ b/rust/lance-namespace-impls/BENCHMARK.md @@ -0,0 +1,73 @@ +# `__manifest` commit benchmark + +Measures how fast the copy-on-write directory catalog commits `__manifest` mutations as +the manifest scales, with the inline scalar indices on or off. + +The catalog commits every mutation by rewriting the whole `__manifest` (copy-on-write) +and atomically writing a new manifest version. This benchmark characterises: + +- **Continuous commit** — a single process commits `N` times into a manifest already + holding `rows` entries (per-commit latency + throughput). +- **Concurrent commit** — `C` processes commit continuously for a fixed duration against + a manifest of `rows` entries (steady, contended TPS). 
+ +## Binary: `examples/manifest_bench.rs` + +``` +manifest_bench seed-large --root --count --inline-optimization \ + [--storage-option aws_region=us-east-1] +manifest_bench run --root --operation write-create-namespace \ + --concurrency 1 --operations 100 --initial-entries --inline-optimization # continuous +manifest_bench run --root --operation write-create-namespace \ + --concurrency 50 --duration-secs 30 --initial-entries --inline-optimization # concurrent +``` + +- `seed-large` bootstraps a manifest to `count` rows by writing the Lance dataset + directly (O(rows) once) and then triggering one CoW rewrite so the on-disk state + matches the steady catalog form (single fragment; inline indices when enabled). +- `run` spawns `--concurrency` worker subprocesses. With `--operations` it runs a fixed + commit budget (continuous); with `--duration-secs` each worker commits until the + deadline (steady TPS). It prints one JSON `BenchResult` per concurrency level with + throughput and p50/p90/p99 latency. +- The committed operation (`--operation`) defaults to `write-create-namespace`, the + cheapest pure-`__manifest` mutation (no table data). `write-create-table` / + `write-declare-table` are also available. + +S3 requires the default `dir-aws` feature (on by default) and AWS credentials in the +environment; pass `--storage-option aws_region=`. + +## Sweep panel: `benches/manifest_commit_sweep.sh` + +Runs the full panel — sizes × {inline index, no index} × {continuous, concurrent×C} — +with per-run S3-copy isolation (each run starts at exactly the bootstrapped size), +JSONL results, a `summary.csv`, and resume support. 
+ +```bash +cargo build --release --example manifest_bench -p lance-namespace-impls +S3_BASE=s3:///manifest-cow-bench/$(date -u +%Y%m%dT%H%M%SZ) \ + rust/lance-namespace-impls/benches/manifest_commit_sweep.sh +``` + +Default panel (override via env): `SIZES="1000 2000 5000 10000 20000 50000 100000 200000 +500000 1000000"`, `CONCURRENCY="10 20 50 100 120 150 200"`, `INLINE_VARIANTS="true false"`, +`CONT_OPS=100`, `CONC_DURATION_SECS=30`. Results land in `$OUT_DIR` (default +`~/manifest_cow_bench_`). + +## Representative results + +EC2 `c7i.48xlarge`, S3 `us-east-1`, op `write-create-namespace`. The catalog is a +single-writer-throughput system: per-commit cost scales ~O(rows) and throughput does **not** +scale with concurrency (every commit is a serialized `__manifest` version bump). + +Continuous (1 process, 100 commits), ops/s — inline index vs no index: + +| rows | inline | no index | +|---:|---:|---:| +| 1,000 | 2.0 | 3.5 | +| 100,000 | 1.1 | 2.1 | +| 1,000,000 | 0.34 | 0.53 | + +Concurrent steady TPS is flat across C=10..200 (e.g. inline @100k ≈ 1.4–1.5 ops/s at every C; +@1M ≈ 0.3 ops/s). Conflicts that exceed the retry budget surface as errors and grow with C +(≈0 at C≤20, climbing at C≥100) — the contention ceiling, not data loss. No-index commits run +~1.5–2× faster (no per-commit index build) at the cost of unindexed reads. 
diff --git a/rust/lance-namespace-impls/Cargo.toml b/rust/lance-namespace-impls/Cargo.toml index 53ff79fb333..c2bf057ee21 100644 --- a/rust/lance-namespace-impls/Cargo.toml +++ b/rust/lance-namespace-impls/Cargo.toml @@ -51,6 +51,8 @@ object_store = { workspace = true } arrow = { workspace = true } arrow-ipc = { workspace = true } arrow-schema = { workspace = true } +datafusion-common = { workspace = true } +datafusion-physical-plan = { workspace = true } # REST adapter implementation dependencies (optional, enabled by "rest-adapter" feature) axum = { workspace = true, optional = true } @@ -66,6 +68,8 @@ serde_json = { workspace = true } futures.workspace = true log.workspace = true rand.workspace = true +roaring.workspace = true +uuid.workspace = true # Shared credential vending dependencies sha2 = { version = "0.10", optional = true } @@ -96,6 +100,11 @@ rstest.workspace = true lance-table.workspace = true lance-arrow = { workspace = true } lance = { workspace = true } +serde = { workspace = true, features = ["derive"] } + +[[example]] +name = "manifest_bench" +path = "examples/manifest_bench.rs" [lints] workspace = true diff --git a/rust/lance-namespace-impls/benches/manifest_commit_sweep.sh b/rust/lance-namespace-impls/benches/manifest_commit_sweep.sh new file mode 100755 index 00000000000..7384ced4152 --- /dev/null +++ b/rust/lance-namespace-impls/benches/manifest_commit_sweep.sh @@ -0,0 +1,146 @@ +#!/usr/bin/env bash +# Copy-on-write __manifest commit benchmark sweep panel. 
+# +# Drives `cargo run --release --example manifest_bench` across a panel of: +# - bootstrap manifest sizes (rows already in __manifest) +# - inline scalar indices on vs off +# - continuous commit (single process, N commits) and +# concurrent commit (C processes, steady TPS over a fixed duration) +# +# Each run is isolated: a "golden" manifest is bootstrapped once per (size, index) +# and server-side-copied to a fresh S3 prefix per run, so every run starts at exactly +# the bootstrapped size. Results are written as JSONL (one BenchResult per line) and +# summarised to CSV. The sweep is resumable: completed runs are skipped. +# +# Usage: +# S3_BASE=s3://jack-devland-build/manifest-cow-bench/$(date -u +%Y%m%dT%H%M%SZ) \ +# ./manifest_commit_sweep.sh +# +# Env knobs (defaults match the requested panel): +# SIZES, CONCURRENCY, INLINE_VARIANTS, CONT_OPS, CONC_DURATION_SECS, +# AWS_REGION, OUT_DIR, BIN +# +# Resilient by design: a single failed run is logged and skipped rather than aborting +# the sweep, and re-running fills the gaps (completed runs are detected and skipped). +set -uo pipefail + +RUN_ID="${RUN_ID:-$(date -u +%Y%m%dT%H%M%SZ)}" +S3_BASE="${S3_BASE:?set S3_BASE, e.g. 
s3://jack-devland-build/manifest-cow-bench/$RUN_ID}" +AWS_REGION="${AWS_REGION:-us-east-1}" +export AWS_REGION AWS_DEFAULT_REGION="$AWS_REGION" + +REPO_ROOT="${REPO_ROOT:-$HOME/oss/lance}" +BIN="${BIN:-$REPO_ROOT/target/release/examples/manifest_bench}" +OUT_DIR="${OUT_DIR:-$HOME/manifest_cow_bench_${RUN_ID}}" +RESULTS="$OUT_DIR/results.jsonl" +PROGRESS="$OUT_DIR/progress.log" +mkdir -p "$OUT_DIR" + +SIZES=(${SIZES:-1000 2000 5000 10000 20000 50000 100000 200000 500000 1000000}) +CONCURRENCY=(${CONCURRENCY:-10 20 50 100 120 150 200}) +INLINE_VARIANTS=(${INLINE_VARIANTS:-true false}) +CONT_OPS="${CONT_OPS:-100}" +CONC_DURATION_SECS="${CONC_DURATION_SECS:-30}" +STORAGE_OPT=(--storage-option "aws_region=${AWS_REGION}") + +log() { printf '%s %s\n' "$(date -u +%H:%M:%S)" "$*" | tee -a "$PROGRESS"; } + +# Skip a run if its tag already appears in results.jsonl (resume support). +done_already() { grep -q "\"bench_tag\":\"$1\"" "$RESULTS" 2>/dev/null; } + +# Append a result line, tagging it so reruns can resume and we can pivot later. +record() { + local tag="$1"; shift + # shellcheck disable=SC2016 + python3 -c 'import json,sys; d=json.load(sys.stdin); d["bench_tag"]=sys.argv[1]; print(json.dumps(d))' \ + "$tag" >> "$RESULTS" +} + +s3_copy() { aws s3 cp --recursive --quiet "$1" "$2" --region "$AWS_REGION"; } +s3_rm() { aws s3 rm --recursive --quiet "$1" --region "$AWS_REGION" || true; } + +# Backstops for unattended runs: cap any single run and clear leaked worker processes +# (a killed coordinator can orphan its worker children) before the next run. +RUN_TIMEOUT="${RUN_TIMEOUT:-1200}" +clear_stragglers() { pkill -f 'examples/manifest_bench worker' 2>/dev/null || true; sleep 1; } + +for inline in "${INLINE_VARIANTS[@]}"; do + for rows in "${SIZES[@]}"; do + golden="${S3_BASE}/golden/inline_${inline}_rows_${rows}" + boot_tag="boot_inline_${inline}_rows_${rows}" + + if ! 
done_already "$boot_tag"; then + log "BOOTSTRAP inline=$inline rows=$rows -> $golden" + s3_rm "$golden" + if "$BIN" seed-large --root "$golden" --count "$rows" \ + --inline-optimization "$inline" "${STORAGE_OPT[@]}"; then + echo "{\"bench_tag\":\"$boot_tag\"}" >> "$RESULTS" + else + log "BOOTSTRAP FAILED inline=$inline rows=$rows (skipping this size)" + continue + fi + else + log "skip bootstrap $boot_tag (done)" + fi + + # ---- Continuous: single process, CONT_OPS commits ---- + cont_tag="cont_inline_${inline}_rows_${rows}" + if ! done_already "$cont_tag"; then + run_prefix="${S3_BASE}/run/${cont_tag}" + log "CONTINUOUS inline=$inline rows=$rows ops=$CONT_OPS" + clear_stragglers + s3_copy "$golden" "$run_prefix" + timeout "$RUN_TIMEOUT" "$BIN" run --root "$run_prefix" --operation write-create-namespace \ + --concurrency 1 --operations "$CONT_OPS" --initial-entries "$rows" \ + --inline-optimization "$inline" "${STORAGE_OPT[@]}" \ + 2>>"$PROGRESS" | while read -r line; do record "$cont_tag" <<<"$line"; done + s3_rm "$run_prefix" + else + log "skip continuous $cont_tag (done)" + fi + + # ---- Concurrent: C processes, steady TPS over CONC_DURATION_SECS ---- + for c in "${CONCURRENCY[@]}"; do + conc_tag="conc_inline_${inline}_rows_${rows}_c_${c}" + if done_already "$conc_tag"; then log "skip concurrent $conc_tag (done)"; continue; fi + run_prefix="${S3_BASE}/run/${conc_tag}" + log "CONCURRENT inline=$inline rows=$rows c=$c dur=${CONC_DURATION_SECS}s" + clear_stragglers + s3_copy "$golden" "$run_prefix" + timeout "$RUN_TIMEOUT" "$BIN" run --root "$run_prefix" --operation write-create-namespace \ + --concurrency "$c" --duration-secs "$CONC_DURATION_SECS" --initial-entries "$rows" \ + --inline-optimization "$inline" "${STORAGE_OPT[@]}" \ + 2>>"$PROGRESS" | while read -r line; do record "$conc_tag" <<<"$line"; done + s3_rm "$run_prefix" + done + done +done + +# ---- Summarise to CSV ---- +CSV="$OUT_DIR/summary.csv" +python3 - "$RESULTS" "$CSV" <<'PY' +import json, sys, csv 
+rows = [] +with open(sys.argv[1]) as f: + for line in f: + d = json.loads(line) + if "throughput_ops_per_sec" not in d: + continue # bootstrap marker + mode = "continuous" if d["duration_secs"] == 0 else "concurrent" + rows.append({ + "mode": mode, "variant": d["variant"], "initial_entries": d["initial_entries"], + "concurrency": d["concurrency"], "duration_secs": d["duration_secs"], + "ops": d["total_operations"], "errors": d["errors"], + "tps": round(d["throughput_ops_per_sec"], 3), + "avg_ms": round(d["avg_latency_ms"], 2), "p50_ms": round(d["p50_latency_ms"], 2), + "p90_ms": round(d["p90_latency_ms"], 2), "p99_ms": round(d["p99_latency_ms"], 2), + }) +rows.sort(key=lambda r: (r["mode"], r["variant"], r["initial_entries"], r["concurrency"])) +with open(sys.argv[2], "w", newline="") as f: + w = csv.DictWriter(f, fieldnames=list(rows[0].keys()) if rows else []) + w.writeheader(); w.writerows(rows) +print(f"wrote {len(rows)} rows to {sys.argv[2]}") +PY + +log "SWEEP COMPLETE. Results: $RESULTS Summary: $CSV" +s3_rm "${S3_BASE}/golden" "${S3_BASE}/run" 2>/dev/null || true diff --git a/rust/lance-namespace-impls/examples/manifest_bench.rs b/rust/lance-namespace-impls/examples/manifest_bench.rs new file mode 100644 index 00000000000..4841f2471d7 --- /dev/null +++ b/rust/lance-namespace-impls/examples/manifest_bench.rs @@ -0,0 +1,714 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Copy-on-write `__manifest` directory-catalog commit benchmark (S3 capable). +//! +//! Measures how fast the directory catalog commits `__manifest` mutations as the +//! manifest scales, with the inline scalar indices on or off. +//! +//! Modes: +//! seed-large — bootstrap a `__manifest` with N rows (direct dataset write + one +//! CoW rewrite to build indices) +//! run — coordinator: spawn `--concurrency` worker processes committing for +//! either a fixed op count (continuous) or a fixed duration (steady TPS) +//! 
worker — (internal) a single committing process spawned by `run` +//! +//! Examples: +//! # Bootstrap 100k rows with inline indices +//! manifest_bench seed-large --root s3://bucket/bench/p --count 100000 \ +//! --inline-optimization true --storage-option aws_region=us-east-1 +//! +//! # Continuous: 100 commits, single process +//! manifest_bench run --root s3://bucket/bench/p --operation write-create-namespace \ +//! --concurrency 1 --operations 100 --initial-entries 100000 --inline-optimization true +//! +//! # Concurrent steady TPS: 50 processes committing for 30s +//! manifest_bench run --root s3://bucket/bench/p --operation write-create-namespace \ +//! --concurrency 50 --duration-secs 30 --initial-entries 100000 --inline-optimization true + +// A CLI benchmark tool: workers emit JSON latency records on stdout and progress on +// stderr, so stdout/stderr printing is intentional here. +#![allow(clippy::print_stdout, clippy::print_stderr)] + +use std::collections::HashMap; +use std::io::{BufRead, BufReader}; +use std::process::{Command, Stdio}; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use arrow::array::builder::{ListBuilder, StringBuilder}; +use arrow::array::{RecordBatch, RecordBatchIterator, StringArray}; +use arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; +use bytes::Bytes; +use lance::dataset::{InsertBuilder, WriteMode, WriteParams}; +use lance_core::datatypes::LANCE_UNENFORCED_PRIMARY_KEY_POSITION; +use lance_namespace::LanceNamespace; +use lance_namespace::models::{ + CreateNamespaceRequest, CreateTableRequest, DeclareTableRequest, DescribeTableRequest, + ListNamespacesRequest, ListTablesRequest, +}; +use lance_namespace_impls::DirectoryNamespaceBuilder; +use serde::{Deserialize, Serialize}; + +#[derive(Serialize, Deserialize, Clone)] +struct LatencyRecord { + operation: String, + latency_ms: f64, + error: bool, +} + +#[derive(Serialize)] +struct BenchResult { + variant: String, + operation: String, + concurrency: usize, + 
initial_entries: usize, + duration_secs: u64, + total_operations: usize, + total_duration_ms: f64, + throughput_ops_per_sec: f64, + avg_latency_ms: f64, + p50_latency_ms: f64, + p90_latency_ms: f64, + p99_latency_ms: f64, + min_latency_ms: f64, + max_latency_ms: f64, + errors: usize, +} + +fn percentile(sorted: &[f64], p: f64) -> f64 { + if sorted.is_empty() { + return 0.0; + } + let idx = ((sorted.len() as f64 - 1.0) * p).round() as usize; + sorted[idx.min(sorted.len() - 1)] +} + +#[allow(clippy::too_many_arguments)] +fn compute_result( + variant: &str, + operation: &str, + concurrency: usize, + initial_entries: usize, + duration_secs: u64, + wall_duration: Duration, + mut latencies: Vec, + errors: usize, +) -> BenchResult { + latencies.sort_by(|a, b| a.partial_cmp(b).unwrap()); + let total = latencies.len(); + let total_ms = wall_duration.as_secs_f64() * 1000.0; + let throughput = if total_ms > 0.0 { + total as f64 / (total_ms / 1000.0) + } else { + 0.0 + }; + BenchResult { + variant: variant.to_string(), + operation: operation.to_string(), + concurrency, + initial_entries, + duration_secs, + total_operations: total, + total_duration_ms: total_ms, + throughput_ops_per_sec: throughput, + avg_latency_ms: if total > 0 { + latencies.iter().sum::() / total as f64 + } else { + 0.0 + }, + p50_latency_ms: percentile(&latencies, 0.50), + p90_latency_ms: percentile(&latencies, 0.90), + p99_latency_ms: percentile(&latencies, 0.99), + min_latency_ms: latencies.first().copied().unwrap_or(0.0), + max_latency_ms: latencies.last().copied().unwrap_or(0.0), + errors, + } +} + +fn create_test_ipc_data() -> Vec { + use arrow::array::Int32Array; + use arrow_ipc::writer::StreamWriter; + + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, false), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 3])), + Arc::new(StringArray::from(vec!["a", "b", "c"])), + 
], + ) + .unwrap(); + let mut buffer = Vec::new(); + { + let mut writer = StreamWriter::try_new(&mut buffer, &schema).unwrap(); + writer.write(&batch).unwrap(); + writer.finish().unwrap(); + } + buffer +} + +/// The `__manifest` schema used by the copy-on-write directory catalog: +/// `object_id`, `object_type`, `location`, `metadata` (Utf8), `base_objects` (List). +fn manifest_schema() -> Arc { + Arc::new(ArrowSchema::new(vec![ + Field::new("object_id", DataType::Utf8, false).with_metadata( + [( + LANCE_UNENFORCED_PRIMARY_KEY_POSITION.to_string(), + "0".to_string(), + )] + .into_iter() + .collect(), + ), + Field::new("object_type", DataType::Utf8, false), + Field::new("location", DataType::Utf8, true), + Field::new("metadata", DataType::Utf8, true), + Field::new( + "base_objects", + DataType::List(Arc::new(Field::new("object_id", DataType::Utf8, true))), + true, + ), + ])) +} + +async fn build_namespace( + root: &str, + inline_optimization: bool, + storage_options: &HashMap, +) -> Box { + let mut properties = HashMap::new(); + properties.insert("root".to_string(), root.to_string()); + properties.insert("dir_listing_enabled".to_string(), "false".to_string()); + properties.insert( + "inline_optimization_enabled".to_string(), + inline_optimization.to_string(), + ); + for (k, v) in storage_options { + properties.insert(format!("storage.{}", k), v.clone()); + } + let builder = DirectoryNamespaceBuilder::from_properties(properties, None) + .expect("Failed to create namespace builder from properties"); + Box::new(builder.build().await.expect("Failed to build namespace")) +} + +// ──────────────────── seed-large mode ──────────────────── +// Bootstrap a `__manifest` with N rows by writing the Lance dataset directly (fast, +// O(N) once), then trigger a single CoW rewrite via the namespace so the on-disk state +// matches what the catalog produces (single fragment + inline indices when enabled). 
+ +const SEED_LARGE_BATCH_SIZE: usize = 50_000; + +fn generate_manifest_batch(start_idx: usize, batch_size: usize, total_count: usize) -> RecordBatch { + let ns_count = total_count / 3; + let actual_size = batch_size.min(total_count - start_idx); + + let mut object_ids = Vec::with_capacity(actual_size); + let mut object_types = Vec::with_capacity(actual_size); + let mut locations: Vec> = Vec::with_capacity(actual_size); + let mut metadatas: Vec> = Vec::with_capacity(actual_size); + + for i in start_idx..start_idx + actual_size { + if i < ns_count { + object_ids.push(format!("ns_{}", i)); + object_types.push("namespace".to_string()); + locations.push(None); + metadatas.push(None); + } else { + let table_idx = i - ns_count; + object_ids.push(format!("table_{}", table_idx)); + object_types.push("table".to_string()); + locations.push(Some(format!("table_{}", table_idx))); + metadatas.push(Some(r#"{"bench":"true"}"#.to_string())); + } + } + + // base_objects is null for every bootstrapped row. 
+ let mut base_objects_builder = ListBuilder::new(StringBuilder::new()) + .with_field(Arc::new(Field::new("object_id", DataType::Utf8, true))); + for _ in 0..actual_size { + base_objects_builder.append_null(); + } + + RecordBatch::try_new( + manifest_schema(), + vec![ + Arc::new(StringArray::from(object_ids)), + Arc::new(StringArray::from(object_types)), + Arc::new(StringArray::from( + locations.iter().map(|l| l.as_deref()).collect::>(), + )), + Arc::new(StringArray::from( + metadatas.iter().map(|m| m.as_deref()).collect::>(), + )), + Arc::new(base_objects_builder.finish()), + ], + ) + .expect("Failed to create manifest batch") +} + +async fn seed_large( + root: &str, + count: usize, + inline_optimization: bool, + storage_options: &HashMap, +) { + let manifest_uri = format!("{}/{}", root, "__manifest"); + eprintln!("Seed-large: writing {} rows to {}", count, manifest_uri); + + let schema = manifest_schema(); + let mut batches = Vec::new(); + let mut offset = 0; + while offset < count { + let batch_size = SEED_LARGE_BATCH_SIZE.min(count - offset); + batches.push(generate_manifest_batch(offset, batch_size, count)); + offset += batch_size; + } + eprintln!(" generated {} batches", batches.len()); + + let mut write_params = WriteParams { + mode: WriteMode::Create, + ..WriteParams::default() + }; + if !storage_options.is_empty() { + let accessor = Arc::new( + lance_io::object_store::StorageOptionsAccessor::with_static_options( + storage_options.clone(), + ), + ); + write_params.store_params = Some(lance_io::object_store::ObjectStoreParams { + storage_options_accessor: Some(accessor), + ..Default::default() + }); + } + + let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema.clone()); + InsertBuilder::new(manifest_uri.as_str()) + .with_params(&write_params) + .execute_stream(reader) + .await + .expect("Failed to write manifest dataset"); + eprintln!(" wrote Lance dataset"); + + // Trigger one CoW rewrite so the manifest is in steady catalog form (single 
+ // fragment; inline indices when enabled). For the no-index variant the first real + // commit performs this rewrite instead. + if inline_optimization { + eprintln!(" triggering initial CoW rewrite to build indices..."); + let start = Instant::now(); + let ns = build_namespace(root, true, storage_options).await; + let mut req = CreateNamespaceRequest::new(); + req.id = Some(vec!["__seed_trigger__".to_string()]); + ns.create_namespace(req) + .await + .expect("Failed to trigger CoW rewrite"); + eprintln!( + " CoW rewrite with index build took {:.1}s", + start.elapsed().as_secs_f64() + ); + } + + let ns_count = count / 3; + eprintln!( + "Seed-large complete: {} rows ({} namespaces, {} tables)", + count, + ns_count, + count - ns_count + ); +} + +// ──────────────────── worker mode ──────────────────── + +#[allow(clippy::too_many_arguments)] +async fn worker( + root: &str, + operation: &str, + operations: usize, + duration_secs: u64, + warmup: usize, + worker_id: usize, + table_count: usize, + inline_optimization: bool, + storage_options: &HashMap, +) { + let ns = build_namespace(root, inline_optimization, storage_options).await; + let ipc_data = Bytes::from(create_test_ipc_data()); + + if operation.starts_with("warm-read") { + for _ in 0..warmup { + let _ = + run_operation(ns.as_ref(), operation, worker_id, 0, table_count, &ipc_data).await; + } + } + + let emit = |op_idx: usize, start: Instant, err: bool| { + let record = LatencyRecord { + operation: operation.to_string(), + latency_ms: start.elapsed().as_secs_f64() * 1000.0, + error: err, + }; + let _ = op_idx; + println!("{}", serde_json::to_string(&record).unwrap()); + }; + + if duration_secs > 0 { + // Steady-TPS mode: commit continuously until the deadline. 
+ let deadline = Instant::now() + Duration::from_secs(duration_secs); + let mut op_idx = 0; + while Instant::now() < deadline { + let start = Instant::now(); + let err = run_operation( + ns.as_ref(), + operation, + worker_id, + op_idx, + table_count, + &ipc_data, + ) + .await + .is_err(); + emit(op_idx, start, err); + op_idx += 1; + } + } else { + for op_idx in 0..operations { + let start = Instant::now(); + let err = run_operation( + ns.as_ref(), + operation, + worker_id, + op_idx, + table_count, + &ipc_data, + ) + .await + .is_err(); + emit(op_idx, start, err); + } + } +} + +async fn run_operation( + ns: &dyn LanceNamespace, + operation: &str, + worker_id: usize, + op_idx: usize, + table_count: usize, + ipc_data: &Bytes, +) -> Result<(), Box> { + match operation { + "cold-read-list-namespaces" | "warm-read-list-namespaces" => { + let mut req = ListNamespacesRequest::new(); + req.id = Some(vec![]); + ns.list_namespaces(req).await?; + } + "cold-read-list-tables" | "warm-read-list-tables" => { + let mut req = ListTablesRequest::new(); + req.id = Some(vec![]); + ns.list_tables(req).await?; + } + "cold-read-describe-table" | "warm-read-describe-table" => { + let table_idx = (worker_id * 1_000_000 + op_idx) % table_count.max(1); + let req = DescribeTableRequest { + id: Some(vec![format!("table_{}", table_idx)]), + ..Default::default() + }; + ns.describe_table(req).await?; + } + "write-create-namespace" => { + let mut req = CreateNamespaceRequest::new(); + req.id = Some(vec![format!("bench_w{}_{}", worker_id, op_idx)]); + ns.create_namespace(req).await?; + } + "write-create-table" => { + let mut req = CreateTableRequest::new(); + req.id = Some(vec![format!("bench_t{}_{}", worker_id, op_idx)]); + ns.create_table(req, ipc_data.clone()).await?; + } + "write-declare-table" => { + let req = DeclareTableRequest { + id: Some(vec![format!("bench_d{}_{}", worker_id, op_idx)]), + ..Default::default() + }; + ns.declare_table(req).await?; + } + _ => { + return Err(format!("unknown 
operation: {}", operation).into()); + } + } + Ok(()) +} + +// ──────────────────── run mode (coordinator) ──────────────────── + +#[allow(clippy::too_many_arguments)] +fn run_workers( + self_exe: &str, + root: &str, + operation: &str, + concurrency: usize, + operations: usize, + duration_secs: u64, + warmup: usize, + table_count: usize, + initial_entries: usize, + inline_optimization: bool, + variant: &str, + storage_options: &HashMap, +) -> BenchResult { + // Continuous mode splits a fixed op budget across workers; steady-TPS mode lets each + // worker run for the full duration. + let ops_per_worker = if duration_secs > 0 { + 0 + } else { + operations / concurrency.max(1) + }; + if duration_secs == 0 && ops_per_worker == 0 { + return compute_result( + variant, + operation, + concurrency, + initial_entries, + duration_secs, + Duration::ZERO, + vec![], + 0, + ); + } + + let wall_start = Instant::now(); + let children: Vec<_> = (0..concurrency) + .map(|worker_id| { + let mut cmd = Command::new(self_exe); + cmd.arg("worker") + .arg("--root") + .arg(root) + .arg("--operation") + .arg(operation) + .arg("--operations") + .arg(ops_per_worker.to_string()) + .arg("--duration-secs") + .arg(duration_secs.to_string()) + .arg("--warmup") + .arg(warmup.to_string()) + .arg("--worker-id") + .arg(worker_id.to_string()) + .arg("--table-count") + .arg(table_count.to_string()) + .arg("--inline-optimization") + .arg(inline_optimization.to_string()); + for (k, v) in storage_options { + cmd.arg("--storage-option").arg(format!("{}={}", k, v)); + } + cmd.stdout(Stdio::piped()) + .stderr(Stdio::inherit()) + .spawn() + .expect("Failed to spawn worker") + }) + .collect(); + + let mut all_latencies = Vec::new(); + let mut total_errors = 0; + for mut child in children { + let stdout = child.stdout.take().unwrap(); + for line in BufReader::new(stdout).lines() { + let line = line.expect("failed to read worker output"); + if let Ok(record) = serde_json::from_str::(&line) { + if record.error { + 
/// Parses a comma-separated list of worker counts, e.g. `"1, 8,50"`.
///
/// Whitespace around each entry is ignored. Entries that are not a positive
/// integer are skipped; unlike before, each skipped non-empty entry is reported
/// on stderr so that a typo does not silently shrink (or empty) the sweep.
fn parse_concurrency_list(s: &str) -> Vec<usize> {
    let mut levels = Vec::new();
    for token in s.split(',').map(str::trim) {
        match token.parse::<usize>() {
            Ok(level) if level > 0 => levels.push(level),
            // Stray commas are harmless; stay quiet about them.
            _ if token.is_empty() => {}
            _ => eprintln!("Ignoring invalid concurrency value: {:?}", token),
        }
    }
    levels
}
"--initial-entries" => { + initial_entries = args[i + 1].parse().unwrap(); + i += 2; + } + "--inline-optimization" => { + inline_optimization = args[i + 1].parse().unwrap(); + i += 2; + } + "--variant" => { + variant = args[i + 1].clone(); + i += 2; + } + "--storage-option" => { + if let Some((k, v)) = args[i + 1].split_once('=') { + storage_options.insert(k.to_string(), v.to_string()); + } + i += 2; + } + other => { + eprintln!("Unknown argument: {}", other); + std::process::exit(1); + } + } + } + + if variant.is_empty() { + variant = if inline_optimization { + "inline_index".to_string() + } else { + "no_index".to_string() + }; + } + + match mode { + "seed-large" => { + seed_large(&root, count, inline_optimization, &storage_options).await; + } + "worker" => { + worker( + &root, + &operation, + operations, + duration_secs, + warmup, + worker_id, + table_count, + inline_optimization, + &storage_options, + ) + .await; + } + "run" => { + let self_exe = std::env::current_exe() + .expect("failed to get self exe path") + .to_string_lossy() + .to_string(); + let op = if operation.is_empty() { + "write-create-namespace" + } else { + operation.as_str() + }; + + eprintln!("=== Manifest commit benchmark ==="); + eprintln!( + "variant={} op={} root={} initial_entries={} concurrency={:?} operations={} duration_secs={}", + variant, op, root, initial_entries, concurrency_list, operations, duration_secs + ); + + for &concurrency in &concurrency_list { + let result = run_workers( + &self_exe, + &root, + op, + concurrency, + operations, + duration_secs, + warmup, + table_count, + initial_entries, + inline_optimization, + &variant, + &storage_options, + ); + eprintln!( + " c={} -> {:.2} ops/s ({} ops, {} errors, p50={:.0}ms p99={:.0}ms)", + concurrency, + result.throughput_ops_per_sec, + result.total_operations, + result.errors, + result.p50_latency_ms, + result.p99_latency_ms + ); + println!("{}", serde_json::to_string(&result).unwrap()); + } + eprintln!("=== complete ==="); + } + _ 
=> { + eprintln!("Unknown mode: {}. Use seed-large, run, or worker.", mode); + std::process::exit(1); + } + } +} diff --git a/rust/lance-namespace-impls/src/dir.rs b/rust/lance-namespace-impls/src/dir.rs index 8859e4bc237..6adc233d8a7 100644 --- a/rust/lance-namespace-impls/src/dir.rs +++ b/rust/lance-namespace-impls/src/dir.rs @@ -313,11 +313,10 @@ impl DirectoryNamespaceBuilder { self } - /// Enable or disable inline optimization of the __manifest table. + /// Enable or disable replacement index maintenance for the __manifest table. /// - /// When enabled (default), performs compaction and indexing on the __manifest table - /// after every write operation to maintain optimal performance. - /// When disabled, manual optimization must be performed separately. + /// When enabled (default), copy-on-write manifest rewrites build replacement indices + /// for fast reads. When disabled, rewrites only replace data files. pub fn inline_optimization_enabled(mut self, enabled: bool) -> Self { self.inline_optimization_enabled = enabled; self @@ -355,7 +354,7 @@ impl DirectoryNamespaceBuilder { /// - `root`: The root directory path (required) /// - `manifest_enabled`: Enable manifest-based table tracking (optional, default: true) /// - `dir_listing_enabled`: Enable directory listing for table discovery (optional, default: true) - /// - `inline_optimization_enabled`: Enable inline optimization of __manifest table (optional, default: true) + /// - `inline_optimization_enabled`: Enable replacement indices on __manifest rewrites (optional, default: true) /// - `storage.*`: Storage options (optional, prefix will be stripped) /// /// Credential vendor properties (prefixed with `credential_vendor.`, prefix is stripped): @@ -2143,6 +2142,7 @@ impl DirectoryNamespace { /// to the manifest to enable manifest-only mode: /// /// ```no_run + /// #![recursion_limit = "256"] /// # use lance_namespace_impls::DirectoryNamespaceBuilder; /// # async fn example() -> Result<(), Box> { /// // 
Create namespace with dual mode (manifest + directory listing) @@ -3235,8 +3235,6 @@ impl LanceNamespace for DirectoryNamespace { ranges, }]; - let mut total_deleted_count = 0i64; - // Branches are not tracked in the manifest catalog, so a branch skips the // __manifest phase entirely and deletes its physical manifests directly. if branch.is_none() @@ -3260,32 +3258,30 @@ impl LanceNamespace for DirectoryNamespace { } // Phase 1 (atomic commit point): Delete version records from __manifest - // for ALL tables in a single atomic operation. This is the authoritative - // source of truth — once __manifest entries are removed, the versions - // are logically deleted across all tables atomically. - - // Collect all (table_id_str, ranges) for batch deletion - let mut all_object_ids: Vec = Vec::new(); - for te in &table_entries { - let table_id_str = manifest::ManifestNamespace::str_object_id( - &te.table_id.clone().unwrap_or_default(), - ); - for (start, end) in &te.ranges { - for version in *start..*end { - let object_id = manifest::ManifestNamespace::build_version_object_id( - &table_id_str, - version, - ); - all_object_ids.push(object_id); - } - } - } - - if !all_object_ids.is_empty() { - total_deleted_count = manifest_ns - .batch_delete_table_versions_by_object_ids(&all_object_ids) - .await?; - } + // for ALL tables in a single atomic copy-on-write rewrite. This is the + // authoritative source of truth — once __manifest entries are removed, + // the versions are logically deleted across all tables atomically. + // + // Request `ranges` carry an exclusive end (`[start, end)`); the manifest + // rewrite API matches an inclusive `[start, end]`, so shift the end down + // by one. Empty ranges collapse to start > end and are dropped downstream. 
+ let table_ranges = table_entries + .iter() + .map(|te| { + let object_id = manifest::ManifestNamespace::str_object_id( + &te.table_id.clone().unwrap_or_default(), + ); + let inclusive_ranges = te + .ranges + .iter() + .map(|&(start, end)| (start, end - 1)) + .collect::>(); + (object_id, inclusive_ranges) + }) + .collect::>(); + let total_deleted_count = manifest_ns + .batch_delete_table_versions_by_ranges(&table_ranges) + .await?; // Phase 2: Delete physical manifest files (best-effort). // Even if some file deletions fail, the versions are already removed from @@ -3303,7 +3299,7 @@ impl LanceNamespace for DirectoryNamespace { // Direct path: delete physical files (no __manifest). Reached when storage // tracking is off, or for any branch (which has no __manifest entries). - total_deleted_count = self + let total_deleted_count = self .delete_physical_version_files(&table_entries, false, branch) .await?; diff --git a/rust/lance-namespace-impls/src/dir/manifest.rs b/rust/lance-namespace-impls/src/dir/manifest.rs index 0e22f1e8b69..067239b8765 100644 --- a/rust/lance-namespace-impls/src/dir/manifest.rs +++ b/rust/lance-namespace-impls/src/dir/manifest.rs @@ -7,27 +7,37 @@ //! to track tables and nested namespaces. 
use arrow::array::builder::{ListBuilder, StringBuilder}; -use arrow::array::{Array, RecordBatch, RecordBatchIterator, StringArray}; -use arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; +use arrow::array::{Array, ListArray, RecordBatch, RecordBatchIterator, StringArray, UInt64Array}; +use arrow::datatypes::{DataType, Field, Schema as ArrowSchema, SchemaRef}; use arrow_ipc::reader::StreamReader; use async_trait::async_trait; use bytes::Bytes; -use futures::{FutureExt, TryStreamExt, stream::StreamExt}; -use lance::dataset::optimize::{CompactionOptions, compact_files}; +use datafusion_common::DataFusionError; +use datafusion_physical_plan::{ + SendableRecordBatchStream, + stream::RecordBatchStreamAdapter as DatafusionRecordBatchStreamAdapter, +}; +use futures::{ + FutureExt, TryStreamExt, + stream::{self, StreamExt}, +}; +use lance::dataset::index::LanceIndexStoreExt; +use lance::dataset::transaction::{Operation, Transaction}; use lance::dataset::{ - DeleteBuilder, MergeInsertBuilder, ReadParams, WhenMatched, WhenNotMatched, WriteMode, - WriteParams, builder::DatasetBuilder, + InsertBuilder, ReadParams, WhenMatched, WriteMode, WriteParams, builder::DatasetBuilder, }; -use lance::index::DatasetIndexExt; use lance::session::Session; use lance::{Dataset, dataset::scanner::Scanner}; use lance_core::Error as LanceError; use lance_core::datatypes::LANCE_UNENFORCED_PRIMARY_KEY_POSITION; -use lance_core::{Error, Result}; -use lance_index::IndexType; -use lance_index::optimize::OptimizeOptions; -use lance_index::scalar::{BuiltinIndexType, ScalarIndexParams}; +use lance_core::{Error, ROW_ID, Result}; +use lance_index::progress::noop_progress; +use lance_index::registry::IndexPluginRegistry; +use lance_index::scalar::lance_format::LanceIndexStore; +use lance_index::scalar::registry::VALUE_COLUMN_NAME; +use lance_index::scalar::{BuiltinIndexType, CreatedIndex, ScalarIndexParams}; use lance_io::object_store::{ObjectStore, ObjectStoreParams}; +use 
lance_io::stream::RecordBatchStream as LanceRecordBatchStream; use lance_namespace::LanceNamespace; use lance_namespace::error::NamespaceError; use lance_namespace::models::{ @@ -41,17 +51,27 @@ use lance_namespace::models::{ TableVersion, }; use lance_namespace::schema::arrow_schema_to_json; +use lance_table::feature_flags::apply_feature_flags; +use lance_table::format::{Fragment, IndexMetadata, Manifest}; +use lance_table::io::commit::{ + CommitError, CommitHandler, commit_handler_from_url, write_manifest_file_to_path, +}; use object_store::{Error as ObjectStoreError, path::Path}; +use roaring::RoaringBitmap; use std::io::Cursor; +use std::time::{SystemTime, UNIX_EPOCH}; use std::{ - collections::HashMap, + collections::{BTreeMap, HashMap, HashSet}, hash::{DefaultHasher, Hash, Hasher}, ops::{Deref, DerefMut}, - sync::Arc, + sync::{Arc, Mutex as StdMutex, MutexGuard as StdMutexGuard}, }; use tokio::sync::{Mutex, RwLock, RwLockReadGuard, RwLockWriteGuard}; +use uuid::Uuid; const MANIFEST_TABLE_NAME: &str = "__manifest"; +const LANCE_DATA_DIR: &str = "data"; +const LANCE_INDICES_DIR: &str = "_indices"; const DELIMITER: &str = "$"; /// Bounded concurrency for per-table `_versions/` probes when filtering declared tables. /// Higher values reduce latency but increase burst load against the object store. @@ -64,9 +84,10 @@ const OBJECT_ID_INDEX_NAME: &str = "object_id_btree"; const OBJECT_TYPE_INDEX_NAME: &str = "object_type_bitmap"; /// LabelList index on the base_objects column for view dependencies const BASE_OBJECTS_INDEX_NAME: &str = "base_objects_label_list"; -/// Inline maintenance on the manifest table is expensive relative to a single-row mutation. -/// Wait until enough fragments accumulate before compacting files or merging indices. -const MANIFEST_INLINE_OPTIMIZATION_FRAGMENT_THRESHOLD: usize = 8; +// Each retry reloads and rewrites the full manifest. Match the regular Lance +// commit retry budget so multi-process namespace writes can make progress. 
+const DEFAULT_MANIFEST_REWRITE_COMMIT_RETRIES: u32 = 20; +const MANIFEST_INDEX_BATCH_SIZE: usize = 8192; /// Object types that can be stored in the manifest #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -77,7 +98,7 @@ pub enum ObjectType { } impl ObjectType { - pub fn as_str(&self) -> &str { + pub fn as_str(&self) -> &'static str { match self { Self::Namespace => "namespace", Self::Table => "table", @@ -160,6 +181,484 @@ pub struct ManifestEntry { pub metadata: Option, } +struct CopyOnWriteMutation { + result: T, + has_changes: bool, +} + +impl CopyOnWriteMutation { + fn updated(result: T) -> Self { + Self { + result, + has_changes: true, + } + } + + fn unchanged(result: T) -> Self { + Self { + result, + has_changes: false, + } + } +} + +struct ManifestIndexBuildInput { + index_name: &'static str, + column_name: &'static str, + params: ScalarIndexParams, + field: Field, + stream: SendableRecordBatchStream, +} + +struct ManifestTrainedIndex { + index_name: &'static str, + column_name: &'static str, + uuid: Uuid, + created_index: CreatedIndex, +} + +struct ManifestRowValue { + object_id: String, + object_type: ObjectType, + location: Option, + metadata: Option, + base_objects: Option>, +} + +struct ManifestOutputRow<'a> { + object_id: &'a str, + object_type: ObjectType, + location: Option<&'a str>, + metadata: Option<&'a str>, + base_objects: Option<&'a [String]>, +} + +#[derive(Default)] +struct ManifestIndexAccumulator { + object_ids: BTreeMap, u64>, + object_types: BTreeMap<&'static str, RoaringBitmap>, + base_objects_values: Vec>>, + base_objects_row_ids: Vec, + row_count: u64, +} + +impl ManifestIndexAccumulator { + fn next_row_id(&self) -> Result { + if self.row_count >= u64::from(u32::MAX) { + return Err(NamespaceError::Internal { + message: format!( + "Manifest rewrite exceeded maximum single-fragment row count: {}", + self.row_count + ), + } + .into()); + } + Ok(self.row_count) + } + + fn push(&mut self, row: &ManifestOutputRow<'_>) -> Result { + let 
row_id = self.next_row_id()?; + if self + .object_ids + .insert(Arc::::from(row.object_id), row_id) + .is_some() + { + return Err(NamespaceError::Internal { + message: format!("Manifest contains duplicate object_id '{}'", row.object_id), + } + .into()); + } + self.object_types + .entry(row.object_type.as_str()) + .or_default() + .insert(row_id as u32); + self.base_objects_values + .push(row.base_objects.map(|objects| objects.to_vec())); + self.base_objects_row_ids.push(row_id); + self.row_count += 1; + Ok(row_id) + } +} + +struct ManifestBatchBuilder { + object_ids: Vec, + object_types: Vec<&'static str>, + locations: Vec>, + metadatas: Vec>, + base_objects: Vec>>, +} + +impl ManifestBatchBuilder { + fn new() -> Self { + Self { + object_ids: Vec::new(), + object_types: Vec::new(), + locations: Vec::new(), + metadatas: Vec::new(), + base_objects: Vec::new(), + } + } + + fn is_empty(&self) -> bool { + self.object_ids.is_empty() + } + + fn append( + &mut self, + index_data: &mut ManifestIndexAccumulator, + row: ManifestOutputRow<'_>, + ) -> Result<()> { + index_data.push(&row)?; + self.object_ids.push(row.object_id.to_string()); + self.object_types.push(row.object_type.as_str()); + self.locations.push(row.location.map(ToString::to_string)); + self.metadatas.push(row.metadata.map(ToString::to_string)); + self.base_objects + .push(row.base_objects.map(|objects| objects.to_vec())); + Ok(()) + } + + fn finish(self) -> Result { + let base_objects_array = ManifestNamespace::base_objects_array(&self.base_objects); + RecordBatch::try_new( + ManifestNamespace::manifest_schema(), + vec![ + Arc::new(StringArray::from(self.object_ids)), + Arc::new(StringArray::from(self.object_types)), + Arc::new(StringArray::from(self.locations)), + Arc::new(StringArray::from(self.metadatas)), + Arc::new(base_objects_array), + ], + ) + .map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to create manifest snapshot batch: {:?}", e), + }) + }) + } +} + 
+/// How to resolve a storage commit conflict (or an ambiguous commit error that did +/// not land) against the latest catalog state, without re-staging the full rewrite. +enum ConflictResolution { + /// Re-read the latest manifest and re-apply the mutation (upserts, version-range + /// deletes). The staged data/index files are discarded and a new rewrite is attempted. + Retry, + /// Creating these object ids with fail-on-conflict semantics. If any of them now + /// exists in the latest manifest, the create lost the race and must fail with a + /// concurrent-modification error; otherwise retry the rewrite. + FailIfExists(Vec), + /// Deleting `object_id`. If it is already absent from the latest manifest the delete + /// has effectively happened, so return `output` as success; otherwise retry. + SucceedIfAbsent { object_id: String, output: O }, +} + +trait ManifestStreamMutation: Send { + type Output: Clone + Send + 'static; + + fn process_existing_row( + &mut self, + row: ManifestRowValue, + output: &mut ManifestBatchBuilder, + index_data: &mut ManifestIndexAccumulator, + ) -> Result<()>; + + fn append_rows( + &mut self, + output: &mut ManifestBatchBuilder, + index_data: &mut ManifestIndexAccumulator, + ) -> Result<()>; + + fn finish(&self) -> CopyOnWriteMutation; + + /// Declares how a storage commit conflict should be resolved against the latest + /// committed catalog state. Defaults to re-reading and re-applying. 
+ fn conflict_resolution(&self) -> ConflictResolution { + ConflictResolution::Retry + } +} + +struct ManifestRewriteShared { + mutation: M, + index_data: Option, + result: Option>, + error: Option, +} + +impl ManifestRewriteShared { + fn new(mutation: M) -> Self { + Self { + mutation, + index_data: Some(ManifestIndexAccumulator::default()), + result: None, + error: None, + } + } +} + +struct UpsertManifestMutation { + entries: Vec, + base_objects: Vec>>, + entry_positions: HashMap, + matched: Vec, + when_matched: WhenMatched, +} + +impl UpsertManifestMutation { + fn new( + entries: Vec, + base_objects: Option>, + when_matched: WhenMatched, + ) -> Self { + let entry_positions = entries + .iter() + .enumerate() + .map(|(index, entry)| (entry.object_id.clone(), index)) + .collect(); + let matched = vec![false; entries.len()]; + let mut entry_base_objects = vec![None; entries.len()]; + if !entry_base_objects.is_empty() { + entry_base_objects[0] = base_objects; + } + Self { + entries, + base_objects: entry_base_objects, + entry_positions, + matched, + when_matched, + } + } + + fn entry_row(&self, index: usize) -> ManifestOutputRow<'_> { + let entry = &self.entries[index]; + ManifestOutputRow { + object_id: &entry.object_id, + object_type: entry.object_type, + location: entry.location.as_deref(), + metadata: entry.metadata.as_deref(), + base_objects: self.base_objects[index].as_deref(), + } + } +} + +impl ManifestStreamMutation for UpsertManifestMutation { + type Output = (); + + fn process_existing_row( + &mut self, + row: ManifestRowValue, + output: &mut ManifestBatchBuilder, + index_data: &mut ManifestIndexAccumulator, + ) -> Result<()> { + if let Some(index) = self.entry_positions.get(&row.object_id).copied() { + match self.when_matched { + WhenMatched::Fail => { + return Err(NamespaceError::ConcurrentModification { + message: format!( + "Object '{}' was concurrently created by another operation", + row.object_id + ), + } + .into()); + } + WhenMatched::UpdateAll => { 
+ self.matched[index] = true; + output.append(index_data, self.entry_row(index))?; + return Ok(()); + } + _ => { + return Err(NamespaceError::Internal { + message: format!( + "Unsupported manifest rewrite matched action: {:?}", + self.when_matched + ), + } + .into()); + } + } + } + + output.append( + index_data, + ManifestOutputRow { + object_id: &row.object_id, + object_type: row.object_type, + location: row.location.as_deref(), + metadata: row.metadata.as_deref(), + base_objects: row.base_objects.as_deref(), + }, + ) + } + + fn append_rows( + &mut self, + output: &mut ManifestBatchBuilder, + index_data: &mut ManifestIndexAccumulator, + ) -> Result<()> { + for index in 0..self.entries.len() { + if !self.matched[index] { + output.append(index_data, self.entry_row(index))?; + } + } + Ok(()) + } + + fn finish(&self) -> CopyOnWriteMutation { + CopyOnWriteMutation::updated(()) + } + + fn conflict_resolution(&self) -> ConflictResolution { + match self.when_matched { + // Fail-on-conflict create: a concurrent writer may have created one of these + // ids. Re-applying would still fail, so check directly instead of re-staging. + WhenMatched::Fail => ConflictResolution::FailIfExists( + self.entries.iter().map(|e| e.object_id.clone()).collect(), + ), + // Metadata upsert is last-writer-wins: re-read and re-apply. 
+ _ => ConflictResolution::Retry, + } + } +} + +struct DeleteObjectMutation { + object_id: String, + deleted: bool, +} + +impl ManifestStreamMutation for DeleteObjectMutation { + type Output = (); + + fn process_existing_row( + &mut self, + row: ManifestRowValue, + output: &mut ManifestBatchBuilder, + index_data: &mut ManifestIndexAccumulator, + ) -> Result<()> { + if row.object_id == self.object_id { + self.deleted = true; + return Ok(()); + } + + output.append( + index_data, + ManifestOutputRow { + object_id: &row.object_id, + object_type: row.object_type, + location: row.location.as_deref(), + metadata: row.metadata.as_deref(), + base_objects: row.base_objects.as_deref(), + }, + ) + } + + fn append_rows( + &mut self, + _output: &mut ManifestBatchBuilder, + _index_data: &mut ManifestIndexAccumulator, + ) -> Result<()> { + Ok(()) + } + + fn finish(&self) -> CopyOnWriteMutation { + if self.deleted { + CopyOnWriteMutation::updated(()) + } else { + CopyOnWriteMutation::unchanged(()) + } + } + + fn conflict_resolution(&self) -> ConflictResolution { + // If a concurrent writer already removed the object, the delete is satisfied. 
+ ConflictResolution::SucceedIfAbsent { + object_id: self.object_id.clone(), + output: (), + } + } +} + +enum DeleteTableVersionsTarget { + ObjectIds(HashSet), + Ranges(Vec), +} + +#[derive(Clone)] +struct DeleteTableVersionRangeTarget { + object_id_prefix: String, + ranges: Vec<(i64, i64)>, +} + +impl DeleteTableVersionRangeTarget { + fn matches(&self, object_id: &str) -> bool { + let Some(version) = object_id + .strip_prefix(&self.object_id_prefix) + .and_then(|suffix| suffix.parse::().ok()) + else { + return false; + }; + + self.ranges + .iter() + .any(|(start, end)| *start <= version && version <= *end) + } +} + +impl DeleteTableVersionsTarget { + fn matches(&self, object_id: &str) -> bool { + match self { + Self::ObjectIds(object_ids) => object_ids.contains(object_id), + Self::Ranges(targets) => targets.iter().any(|target| target.matches(object_id)), + } + } +} + +struct DeleteTableVersionsMutation { + target: DeleteTableVersionsTarget, + deleted_count: i64, +} + +impl ManifestStreamMutation for DeleteTableVersionsMutation { + type Output = i64; + + fn process_existing_row( + &mut self, + row: ManifestRowValue, + output: &mut ManifestBatchBuilder, + index_data: &mut ManifestIndexAccumulator, + ) -> Result<()> { + if row.object_type == ObjectType::TableVersion && self.target.matches(&row.object_id) { + self.deleted_count += 1; + return Ok(()); + } + + output.append( + index_data, + ManifestOutputRow { + object_id: &row.object_id, + object_type: row.object_type, + location: row.location.as_deref(), + metadata: row.metadata.as_deref(), + base_objects: row.base_objects.as_deref(), + }, + ) + } + + fn append_rows( + &mut self, + _output: &mut ManifestBatchBuilder, + _index_data: &mut ManifestIndexAccumulator, + ) -> Result<()> { + Ok(()) + } + + fn finish(&self) -> CopyOnWriteMutation { + if self.deleted_count > 0 { + CopyOnWriteMutation::updated(self.deleted_count) + } else { + CopyOnWriteMutation::unchanged(0) + } + } +} + /// Information about a namespace stored 
in the manifest #[derive(Debug, Clone)] pub struct NamespaceInfo { @@ -190,6 +689,14 @@ impl DatasetConsistencyWrapper { }) } + /// Reload the dataset and return a reference. + pub async fn get_refreshed(&self) -> Result> { + self.reload().await?; + Ok(DatasetReadGuard { + guard: self.0.read().await, + }) + } + /// Get a mutable reference to the dataset. /// Always reloads to ensure strong consistency. pub async fn get_mut(&self) -> Result> { @@ -306,8 +813,8 @@ pub struct ManifestNamespace { /// If true, root namespace tables use {table_name}.lance naming /// If false, they use namespace-prefixed names dir_listing_enabled: bool, - /// Whether to perform inline optimization (compaction and indexing) on the __manifest table - /// after every write. Defaults to true. + /// Whether copy-on-write manifest rewrites should build replacement indices. + /// Defaults to true. inline_optimization_enabled: bool, /// Number of retries for commit operations on the manifest table. /// If None, defaults to [`lance_table::io::commit::CommitConfig`] default (20). @@ -493,6 +1000,32 @@ impl ManifestNamespace { ) } + fn build_version_object_id_prefix(table_object_id: &str) -> String { + format!("{}{}", table_object_id, DELIMITER) + } + + fn normalize_table_version_ranges(ranges: &[(i64, i64)]) -> Vec<(i64, i64)> { + let mut normalized = ranges + .iter() + .filter_map(|(start, end)| (*start <= *end).then_some((*start, *end))) + .collect::>(); + normalized.sort_unstable(); + + let mut merged: Vec<(i64, i64)> = Vec::with_capacity(normalized.len()); + for (start, end) in normalized { + let Some((_last_start, last_end)) = merged.last_mut() else { + merged.push((start, end)); + continue; + }; + if start <= *last_end + 1 { + *last_end = (*last_end).max(end); + continue; + } + merged.push((start, end)); + } + merged + } + /// Parse a version number from the version suffix of a table version object_id. /// /// The object_id is formatted as `{table_id}${zero_padded_version}`. 
@@ -556,165 +1089,389 @@ impl ManifestNamespace { Ok(full_url.to_string()) } - /// Perform inline optimization on the __manifest table. - /// - /// This method: - /// 1. Creates three indexes on the manifest table: - /// - BTREE index on object_id for fast lookups - /// - Bitmap index on object_type for filtering by type - /// - LabelList index on base_objects for view dependencies - /// 2. Runs file compaction to merge small files - /// 3. Optimizes existing indices - /// - /// This is called automatically after writes when inline_optimization_enabled is true. - async fn run_inline_optimization(&self) -> Result<()> { - if !self.inline_optimization_enabled { - return Ok(()); + fn string_list_array(values: &[Option>], child_name: &str) -> ListArray { + let string_builder = StringBuilder::new(); + let mut list_builder = ListBuilder::new(string_builder).with_field(Arc::new(Field::new( + child_name, + DataType::Utf8, + true, + ))); + for value in values { + match value { + Some(objects) => { + for object in objects { + list_builder.values().append_value(object); + } + list_builder.append(true); + } + None => list_builder.append_null(), + } } + list_builder.finish() + } - // Get a mutable reference to the dataset to perform optimization - let mut dataset_guard = self.manifest_dataset.get_mut().await?; - let dataset: &mut Dataset = &mut dataset_guard; + fn base_objects_array(values: &[Option>]) -> ListArray { + Self::string_list_array(values, "object_id") + } - // Step 1: Create indexes if they don't already exist - let indices = dataset.load_indices().await?; + fn value_row_id_schema(value_field: Field) -> SchemaRef { + Arc::new(ArrowSchema::new(vec![ + value_field, + Field::new(ROW_ID, DataType::UInt64, false), + ])) + } - // Check which indexes already exist - let has_object_id_index = indices.iter().any(|idx| idx.name == OBJECT_ID_INDEX_NAME); - let has_object_type_index = indices.iter().any(|idx| idx.name == OBJECT_TYPE_INDEX_NAME); - let has_base_objects_index = 
indices - .iter() - .any(|idx| idx.name == BASE_OBJECTS_INDEX_NAME); + fn string_row_id_batch( + schema: SchemaRef, + values: Vec, + row_ids: Vec, + ) -> Result { + RecordBatch::try_new( + schema, + vec![ + Arc::new(StringArray::from(values)), + Arc::new(UInt64Array::from(row_ids)), + ], + ) + .map_err(Into::into) + } - // Create BTREE index on object_id - if !has_object_id_index { - log::debug!( - "Creating BTREE index '{}' on object_id for __manifest table", - OBJECT_ID_INDEX_NAME - ); - let params = ScalarIndexParams::for_builtin(BuiltinIndexType::BTree); - if let Err(e) = dataset - .create_index( - &["object_id"], - IndexType::BTree, - Some(OBJECT_ID_INDEX_NAME.to_string()), - ¶ms, - true, - ) - .await - { - log::warn!( - "Failed to create BTREE index on object_id for __manifest table: {:?}. Query performance may be impacted.", - e - ); - } else { - log::info!( - "Created BTREE index '{}' on object_id for __manifest table", - OBJECT_ID_INDEX_NAME - ); - } - } + fn list_row_id_batch( + schema: SchemaRef, + values: Vec>>, + row_ids: Vec, + ) -> Result { + RecordBatch::try_new( + schema, + vec![ + Arc::new(Self::string_list_array(&values, "item")), + Arc::new(UInt64Array::from(row_ids)), + ], + ) + .map_err(Into::into) + } - // Create Bitmap index on object_type - if !has_object_type_index { - log::debug!( - "Creating Bitmap index '{}' on object_type for __manifest table", - OBJECT_TYPE_INDEX_NAME - ); - let params = ScalarIndexParams::default(); - if let Err(e) = dataset - .create_index( - &["object_type"], - IndexType::Bitmap, - Some(OBJECT_TYPE_INDEX_NAME.to_string()), - ¶ms, - true, - ) - .await - { - log::warn!( - "Failed to create Bitmap index on object_type for __manifest table: {:?}. 
Query performance may be impacted.", - e - ); - } else { - log::info!( - "Created Bitmap index '{}' on object_type for __manifest table", - OBJECT_TYPE_INDEX_NAME - ); - } - } + fn object_id_index_stream(object_ids: BTreeMap, u64>) -> SendableRecordBatchStream { + let schema = + Self::value_row_id_schema(Field::new(VALUE_COLUMN_NAME, DataType::Utf8, false)); + let stream_schema = schema.clone(); + let stream = stream::unfold( + (object_ids.into_iter(), false, schema), + |(mut iter, emitted, schema)| async move { + let mut values = Vec::with_capacity(MANIFEST_INDEX_BATCH_SIZE); + let mut row_ids = Vec::with_capacity(MANIFEST_INDEX_BATCH_SIZE); + for _ in 0..MANIFEST_INDEX_BATCH_SIZE { + let Some((value, row_id)) = iter.next() else { + break; + }; + values.push(value.to_string()); + row_ids.push(row_id); + } + if values.is_empty() { + if emitted { + None + } else { + let batch = Self::string_row_id_batch(schema.clone(), values, row_ids) + .map_err(|err| DataFusionError::External(Box::new(err))); + Some((batch, (iter, true, schema))) + } + } else { + let batch = Self::string_row_id_batch(schema.clone(), values, row_ids) + .map_err(|err| DataFusionError::External(Box::new(err))); + Some((batch, (iter, true, schema))) + } + }, + ); + Box::pin(DatafusionRecordBatchStreamAdapter::new( + stream_schema, + stream.fuse(), + )) + } - // Create LabelList index on base_objects - if !has_base_objects_index { - log::debug!( - "Creating LabelList index '{}' on base_objects for __manifest table", - BASE_OBJECTS_INDEX_NAME - ); - let params = ScalarIndexParams::default(); - if let Err(e) = dataset - .create_index( - &["base_objects"], - IndexType::LabelList, - Some(BASE_OBJECTS_INDEX_NAME.to_string()), - ¶ms, - true, + fn object_type_index_stream( + object_types: BTreeMap<&'static str, RoaringBitmap>, + ) -> SendableRecordBatchStream { + let schema = + Self::value_row_id_schema(Field::new(VALUE_COLUMN_NAME, DataType::Utf8, false)); + let stream_schema = schema.clone(); + let entries 
= object_types + .into_iter() + .map(|(value, bitmap)| { + ( + value, + Box::new(bitmap.into_iter()) as Box + Send>, ) - .await - { - log::warn!( - "Failed to create LabelList index on base_objects for __manifest table: {:?}. Query performance may be impacted.", - e - ); - } else { - log::info!( - "Created LabelList index '{}' on base_objects for __manifest table", - BASE_OBJECTS_INDEX_NAME - ); - } - } + }) + .collect::>() + .into_iter(); + let stream = stream::unfold( + (entries, None, false, schema), + |(mut entries, mut current, emitted, schema)| async move { + let mut values = Vec::with_capacity(MANIFEST_INDEX_BATCH_SIZE); + let mut row_ids = Vec::with_capacity(MANIFEST_INDEX_BATCH_SIZE); + while values.len() < MANIFEST_INDEX_BATCH_SIZE { + if current.is_none() { + current = entries.next(); + } + let Some((value, iter)) = current.as_mut() else { + break; + }; + if let Some(row_id) = iter.next() { + values.push((*value).to_string()); + row_ids.push(u64::from(row_id)); + } else { + current = None; + } + } - let should_compact_and_optimize = - dataset.count_fragments() >= MANIFEST_INLINE_OPTIMIZATION_FRAGMENT_THRESHOLD; + if values.is_empty() { + if emitted { + None + } else { + let batch = Self::string_row_id_batch(schema.clone(), values, row_ids) + .map_err(|err| DataFusionError::External(Box::new(err))); + Some((batch, (entries, current, true, schema))) + } + } else { + let batch = Self::string_row_id_batch(schema.clone(), values, row_ids) + .map_err(|err| DataFusionError::External(Box::new(err))); + Some((batch, (entries, current, true, schema))) + } + }, + ); + Box::pin(DatafusionRecordBatchStreamAdapter::new( + stream_schema, + stream.fuse(), + )) + } - if !should_compact_and_optimize { - return Ok(()); + fn base_objects_index_stream( + base_objects_values: Vec>>, + base_objects_row_ids: Vec, + ) -> SendableRecordBatchStream { + let schema = Self::value_row_id_schema(Field::new( + VALUE_COLUMN_NAME, + DataType::List(Arc::new(Field::new("item", 
DataType::Utf8, true))), + true, + )); + let stream_schema = schema.clone(); + let stream = stream::unfold( + ( + base_objects_values.into_iter().zip(base_objects_row_ids), + false, + schema, + ), + |(mut iter, emitted, schema)| async move { + let mut values = Vec::with_capacity(MANIFEST_INDEX_BATCH_SIZE); + let mut row_ids = Vec::with_capacity(MANIFEST_INDEX_BATCH_SIZE); + for _ in 0..MANIFEST_INDEX_BATCH_SIZE { + let Some((value, row_id)) = iter.next() else { + break; + }; + values.push(value); + row_ids.push(row_id); + } + if values.is_empty() { + if emitted { + None + } else { + let batch = Self::list_row_id_batch(schema.clone(), values, row_ids) + .map_err(|err| DataFusionError::External(Box::new(err))); + Some((batch, (iter, true, schema))) + } + } else { + let batch = Self::list_row_id_batch(schema.clone(), values, row_ids) + .map_err(|err| DataFusionError::External(Box::new(err))); + Some((batch, (iter, true, schema))) + } + }, + ); + Box::pin(DatafusionRecordBatchStreamAdapter::new( + stream_schema, + stream.fuse(), + )) + } + + async fn train_manifest_index( + dataset: &Dataset, + registry: Arc, + input: ManifestIndexBuildInput, + index_uuid: Uuid, + ) -> Result { + let index_store = LanceIndexStore::from_dataset_for_new(dataset, &index_uuid)?; + let plugin = registry.get_plugin_by_name(&input.params.index_type)?; + let training_request = plugin + .new_training_request(input.params.params.as_deref().unwrap_or("{}"), &input.field)?; + let created_index = plugin + .train_index( + input.stream, + &index_store, + training_request, + None, + noop_progress(), + ) + .await?; + Ok(ManifestTrainedIndex { + index_name: input.index_name, + column_name: input.column_name, + uuid: index_uuid, + created_index, + }) + } + + fn manifest_index_metadata( + lance_schema: &lance_core::datatypes::Schema, + fragment_bitmap: &RoaringBitmap, + dataset_version: u64, + trained_index: ManifestTrainedIndex, + ) -> Result { + Ok(IndexMetadata { + uuid: trained_index.uuid, + fields: 
vec![lance_schema.field_id(trained_index.column_name)?], + name: trained_index.index_name.to_string(), + dataset_version, + fragment_bitmap: Some(fragment_bitmap.clone()), + index_details: Some(Arc::new(trained_index.created_index.index_details)), + index_version: trained_index.created_index.index_version as i32, + created_at: None, + base_id: None, + files: Some(trained_index.created_index.files), + }) + } + + fn manifest_fragment_bitmap(manifest: &Manifest) -> Result { + let mut bitmap = RoaringBitmap::new(); + for fragment in manifest.fragments.iter() { + let fragment_id = u32::try_from(fragment.id).map_err(|_| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Manifest fragment id {} exceeds u32", fragment.id), + }) + })?; + bitmap.insert(fragment_id); } + Ok(bitmap) + } - // Step 2: Run file compaction - log::debug!("Running file compaction on __manifest table"); - match compact_files(dataset, CompactionOptions::default(), None).await { - Ok(compaction_metrics) => { - if compaction_metrics.fragments_removed > 0 { - log::info!( - "Compacted __manifest table: removed {} fragments, added {} fragments", - compaction_metrics.fragments_removed, - compaction_metrics.fragments_added - ); + fn manifest_from_overwrite_transaction( + previous: &Manifest, + schema: lance_core::datatypes::Schema, + fragments: &[Fragment], + ) -> Manifest { + let mut next_fragment_id = 0; + let mut fragments = fragments + .iter() + .cloned() + .map(|mut fragment| { + if fragment.id == 0 { + fragment.id = next_fragment_id; + next_fragment_id += 1; } - } - Err(e) => { - log::warn!( - "Failed to compact files for __manifest table: {:?}. 
Continuing with optimization.", - e - ); - } - } + fragment + }) + .collect::>(); + fragments.sort_by_key(|fragment| fragment.id); + Manifest::new_from_previous(previous, schema, Arc::new(fragments)) + } - // Step 3: Optimize indices - log::debug!("Optimizing indices on __manifest table"); - match dataset.optimize_indices(&OptimizeOptions::default()).await { - Ok(_) => { - log::info!("Successfully optimized indices on __manifest table"); - } - Err(e) => { - log::warn!( - "Failed to optimize indices on __manifest table: {:?}. Continuing anyway.", - e - ); - } - } + async fn build_manifest_indices( + dataset: &Dataset, + manifest: &Manifest, + index_data: ManifestIndexAccumulator, + index_uuids: [Uuid; 3], + ) -> Result> { + let fragment_bitmap = Self::manifest_fragment_bitmap(manifest)?; + let schema = &manifest.schema; + let ManifestIndexAccumulator { + object_ids, + object_types, + base_objects_values, + base_objects_row_ids, + .. + } = index_data; + let [object_id_uuid, object_type_uuid, base_objects_uuid] = index_uuids; + let registry = IndexPluginRegistry::with_default_plugins(); + + let dataset_version = manifest.version; + let object_id_index_fut = Self::build_manifest_index( + dataset, + registry.clone(), + schema, + ManifestIndexBuildInput { + index_name: OBJECT_ID_INDEX_NAME, + column_name: "object_id", + params: ScalarIndexParams::for_builtin(BuiltinIndexType::BTree), + field: Field::new(VALUE_COLUMN_NAME, DataType::Utf8, false), + stream: Self::object_id_index_stream(object_ids), + }, + &fragment_bitmap, + dataset_version, + object_id_uuid, + ); + let object_type_index_fut = Self::build_manifest_index( + dataset, + registry.clone(), + schema, + ManifestIndexBuildInput { + index_name: OBJECT_TYPE_INDEX_NAME, + column_name: "object_type", + params: ScalarIndexParams::for_builtin(BuiltinIndexType::Bitmap), + field: Field::new(VALUE_COLUMN_NAME, DataType::Utf8, false), + stream: Self::object_type_index_stream(object_types), + }, + &fragment_bitmap, + 
dataset_version, + object_type_uuid, + ); + let base_objects_index_fut = Self::build_manifest_index( + dataset, + registry, + schema, + ManifestIndexBuildInput { + index_name: BASE_OBJECTS_INDEX_NAME, + column_name: "base_objects", + params: ScalarIndexParams::for_builtin(BuiltinIndexType::LabelList), + field: Field::new( + VALUE_COLUMN_NAME, + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), + true, + ), + stream: Self::base_objects_index_stream(base_objects_values, base_objects_row_ids), + }, + &fragment_bitmap, + dataset_version, + base_objects_uuid, + ); - Ok(()) + let (object_id_index, object_type_index, base_objects_index) = futures::join!( + object_id_index_fut, + object_type_index_fut, + base_objects_index_fut + ); + + Ok(vec![ + object_id_index?, + object_type_index?, + base_objects_index?, + ]) + } + + async fn build_manifest_index( + dataset: &Dataset, + registry: Arc, + lance_schema: &lance_core::datatypes::Schema, + input: ManifestIndexBuildInput, + fragment_bitmap: &RoaringBitmap, + dataset_version: u64, + index_uuid: Uuid, + ) -> Result { + let trained_index = + Self::train_manifest_index(dataset, registry, input, index_uuid).await?; + Self::manifest_index_metadata( + lance_schema, + fragment_bitmap, + dataset_version, + trained_index, + ) } /// Get the manifest schema @@ -783,147 +1540,755 @@ impl ManifestNamespace { }) } - /// Check if the manifest contains an object with the given ID - async fn manifest_contains_object(&self, object_id: &str) -> Result { - let escaped_id = object_id.replace('\'', "''"); - let filter = format!("object_id = '{}'", escaped_id); + fn required_string_value<'a>( + array: &'a StringArray, + row: usize, + column_name: &str, + ) -> Result<&'a str> { + if array.is_null(row) { + return Err(NamespaceError::Internal { + message: format!("Manifest column '{}' has null at row {}", column_name, row), + } + .into()); + } + Ok(array.value(row)) + } - let dataset_guard = self.manifest_dataset.get().await?; - let 
mut scanner = dataset_guard.scan(); + fn optional_string_value(array: &StringArray, row: usize) -> Option { + (!array.is_null(row)).then(|| array.value(row).to_string()) + } - scanner.filter(&filter).map_err(|e| { + fn base_objects_column_values(batch: &RecordBatch) -> Result>>> { + let Some(column) = batch.column_by_name("base_objects") else { + return Ok(vec![None; batch.num_rows()]); + }; + let array = column.as_any().downcast_ref::().ok_or_else(|| { lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to filter: {:?}", e), + message: format!( + "Column 'base_objects' is not a list array: {:?}", + column.data_type() + ), }) })?; - // Project no columns and enable row IDs for count_rows to work - scanner.project::<&str>(&[]).map_err(|e| { + let mut values = Vec::with_capacity(batch.num_rows()); + for row in 0..batch.num_rows() { + if array.is_null(row) { + values.push(None); + continue; + } + let row_values = array.value(row); + let row_values = row_values + .as_any() + .downcast_ref::() + .ok_or_else(|| { + lance_core::Error::from(NamespaceError::Internal { + message: "Column 'base_objects' values are not strings".to_string(), + }) + })?; + let mut objects = Vec::with_capacity(row_values.len()); + for value_index in 0..row_values.len() { + if row_values.is_null(value_index) { + return Err(NamespaceError::Internal { + message: format!( + "Manifest column 'base_objects' has null item at row {} item {}", + row, value_index + ), + } + .into()); + } + objects.push(row_values.value(value_index).to_string()); + } + values.push(Some(objects)); + } + Ok(values) + } + + async fn manifest_projected_stream(dataset: &Dataset) -> Result { + let mut scanner = dataset.scan(); + scanner + .project(&[ + "object_id", + "object_type", + "location", + "metadata", + "base_objects", + ]) + .map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to project manifest columns: {:?}", e), + }) + })?; + let stream = 
scanner.try_into_stream().await.map_err(|e| { lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to project: {:?}", e), + message: format!("Failed to create manifest stream: {:?}", e), }) })?; + let schema = stream.schema(); + let stream = stream.map_err(|err| DataFusionError::External(Box::new(err))); + Ok(Box::pin(DatafusionRecordBatchStreamAdapter::new( + schema, + stream.fuse(), + ))) + } - scanner.with_row_id(); + fn manifest_rewrite_commit_retries(&self) -> u32 { + self.commit_retries + .unwrap_or(DEFAULT_MANIFEST_REWRITE_COMMIT_RETRIES) + } - let count = scanner.count_rows().await.map_err(|e| { + fn lock_manifest_rewrite_shared( + shared: &Arc>>, + ) -> Result>> { + shared.lock().map_err(|_| { lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to count rows: {:?}", e), + message: "Manifest rewrite state mutex was poisoned".to_string(), }) - })?; + }) + } - Ok(count > 0) + fn set_manifest_rewrite_error( + shared: &Arc>>, + err: LanceError, + ) { + match shared.lock() { + Ok(mut guard) => { + guard.error = Some(err); + } + Err(poisoned) => { + let mut guard = poisoned.into_inner(); + guard.error = Some(err); + } + } } - /// Query the manifest for a table with the given object ID - async fn query_manifest_for_table(&self, object_id: &str) -> Result> { - let escaped_id = object_id.replace('\'', "''"); - let filter = format!("object_id = '{}' AND object_type = 'table'", escaped_id); - let mut scanner = self.manifest_scanner().await?; - scanner.filter(&filter).map_err(|e| { + fn take_manifest_rewrite_error( + shared: &Arc>>, + ) -> Result> { + let mut guard = Self::lock_manifest_rewrite_shared(shared)?; + Ok(guard.error.take()) + } + + fn process_manifest_rewrite_batch( + batch: RecordBatch, + shared: &Arc>>, + ) -> Result> { + let object_ids = Self::get_string_column(&batch, "object_id")?; + let object_types = Self::get_string_column(&batch, "object_type")?; + let locations = Self::get_string_column(&batch, 
"location")?; + let metadatas = Self::get_string_column(&batch, "metadata")?; + let base_objects = Self::base_objects_column_values(&batch)?; + let mut output = ManifestBatchBuilder::new(); + let mut guard = Self::lock_manifest_rewrite_shared(shared)?; + let mut index_data = guard.index_data.take().ok_or_else(|| { lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to filter: {:?}", e), + message: "Manifest rewrite index state is unavailable".to_string(), }) })?; - scanner - .project(&["object_id", "location", "metadata"]) - .map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to project: {:?}", e), - }) - })?; - let batches = Self::execute_scanner(scanner).await?; - - let mut found_result: Option = None; - let mut total_rows = 0; + for (row, base_objects) in base_objects.into_iter().enumerate().take(batch.num_rows()) { + let row_value = ManifestRowValue { + object_id: Self::required_string_value(object_ids, row, "object_id")?.to_string(), + object_type: ObjectType::parse(Self::required_string_value( + object_types, + row, + "object_type", + )?)?, + location: Self::optional_string_value(locations, row), + metadata: Self::optional_string_value(metadatas, row), + base_objects, + }; + guard + .mutation + .process_existing_row(row_value, &mut output, &mut index_data)?; + } + guard.index_data = Some(index_data); + if output.is_empty() { + return Ok(None); + } + Ok(Some(output.finish()?)) + } - for batch in batches { - if batch.num_rows() == 0 { - continue; - } + fn finish_manifest_rewrite_stream( + shared: &Arc>>, + ) -> Result> { + let mut output = ManifestBatchBuilder::new(); + let mut guard = Self::lock_manifest_rewrite_shared(shared)?; + let mut index_data = guard.index_data.take().ok_or_else(|| { + lance_core::Error::from(NamespaceError::Internal { + message: "Manifest rewrite index state is unavailable".to_string(), + }) + })?; + guard.mutation.append_rows(&mut output, &mut index_data)?; + let 
result = guard.mutation.finish(); + let force_empty_batch = index_data.row_count == 0; + guard.result = Some(result); + guard.index_data = Some(index_data); + if output.is_empty() && !force_empty_batch { + Ok(None) + } else { + Ok(Some(output.finish()?)) + } + } - total_rows += batch.num_rows(); - if total_rows > 1 { - return Err(NamespaceError::Internal { - message: format!( - "Expected exactly 1 table with id '{}', found {}", - object_id, total_rows - ), - } - .into()); - } + fn manifest_rewrite_output_stream( + source: SendableRecordBatchStream, + shared: Arc>>, + ) -> SendableRecordBatchStream { + enum Phase { + Source, + Finish, + Done, + } - let object_id_array = Self::get_string_column(&batch, "object_id")?; - let location_array = Self::get_string_column(&batch, "location")?; - let metadata_array = Self::get_string_column(&batch, "metadata")?; - let location = location_array.value(0).to_string(); - let metadata = if !metadata_array.is_null(0) { - let metadata_str = metadata_array.value(0); - match serde_json::from_str::>(metadata_str) { - Ok(map) => Some(map), - Err(e) => { - return Err(NamespaceError::Internal { - message: format!( - "Failed to deserialize metadata for table '{}': {}", - object_id, e - ), + let schema = Self::manifest_schema(); + let stream = stream::unfold( + (source, shared, Phase::Source), + |(mut source, shared, mut phase)| async move { + loop { + match phase { + Phase::Source => match source.next().await { + Some(Ok(batch)) => { + match Self::process_manifest_rewrite_batch(batch, &shared) { + Ok(Some(batch)) => { + return Some((Ok(batch), (source, shared, phase))); + } + Ok(None) => continue, + Err(err) => { + let message = err.to_string(); + Self::set_manifest_rewrite_error(&shared, err); + return Some(( + Err(DataFusionError::External(Box::new( + std::io::Error::other(message), + ))), + (source, shared, Phase::Done), + )); + } + } + } + Some(Err(err)) => { + return Some((Err(err), (source, shared, Phase::Done))); + } + None => phase 
= Phase::Finish, + }, + Phase::Finish => { + phase = Phase::Done; + match Self::finish_manifest_rewrite_stream(&shared) { + Ok(Some(batch)) => { + return Some((Ok(batch), (source, shared, phase))); + } + Ok(None) => continue, + Err(err) => { + let message = err.to_string(); + Self::set_manifest_rewrite_error(&shared, err); + return Some(( + Err(DataFusionError::External(Box::new( + std::io::Error::other(message), + ))), + (source, shared, Phase::Done), + )); + } + } } - .into()); + Phase::Done => return None, } } - } else { - None - }; - let (namespace, name) = Self::parse_object_id(object_id_array.value(0)); - found_result = Some(TableInfo { - namespace, - name, - location, - metadata, - }); - } + }, + ); + Box::pin(DatafusionRecordBatchStreamAdapter::new( + schema, + stream.fuse(), + )) + } - Ok(found_result) + fn take_manifest_rewrite_result( + shared: &Arc>>, + ) -> Result<(CopyOnWriteMutation, ManifestIndexAccumulator)> { + let mut guard = Self::lock_manifest_rewrite_shared(shared)?; + let result = guard.result.take().ok_or_else(|| { + lance_core::Error::from(NamespaceError::Internal { + message: "Manifest rewrite stream did not finish".to_string(), + }) + })?; + let index_data = guard.index_data.take().ok_or_else(|| { + lance_core::Error::from(NamespaceError::Internal { + message: "Manifest rewrite index state is unavailable".to_string(), + }) + })?; + Ok((result, index_data)) } - fn serialize_metadata( - properties: Option<&HashMap>, - object_type: &str, - object_id: &str, - ) -> Result> { - match properties { - Some(properties) if !properties.is_empty() => { - serde_json::to_string(properties).map(Some).map_err(|e| { - LanceError::from(NamespaceError::Internal { - message: format!( - "Failed to serialize {} metadata for '{}': {}", - object_type, object_id, e - ), - }) - }) + /// Delete the staged (uncommitted) data files and index directories for a rewrite. 
+ /// Only call this once the rewrite is known *not* to have landed (a put-if-not-exists + /// conflict, or an ambiguous error whose target version does not reference our data + /// file) — otherwise it would orphan files a committed manifest still references. + async fn cleanup_staged_manifest_files( + &self, + object_store: &ObjectStore, + data_files: &HashSet, + index_uuids: &[Uuid], + ) { + let data_dir = self + .base_path + .clone() + .join(MANIFEST_TABLE_NAME) + .join(LANCE_DATA_DIR); + for path in data_files { + let data_path = data_dir.clone().join(path.as_str()); + if let Err(err) = object_store.delete(&data_path).await { + log::warn!( + "Failed to clean up uncommitted manifest rewrite data file '{}': {}", + data_path, + err + ); } - _ => Ok(None), } + self.cleanup_uncommitted_manifest_index_dirs(object_store, index_uuids.iter().copied()) + .await; } - pub(crate) async fn path_has_actual_manifests( + async fn cleanup_uncommitted_manifest_index_dirs( + &self, object_store: &ObjectStore, - table_path: &Path, - ) -> Result { - let versions_path = table_path - .clone() - .join(lance_table::io::commit::VERSIONS_DIR); - // `_versions/` should only contain manifest files, so probing the first entry is enough - // to distinguish declared-only tables (empty `_versions/`) from created tables. - Ok(object_store - .list(Some(versions_path)) - .try_next() - .await? - .is_some()) + index_uuids: impl IntoIterator, + ) { + for index_uuid in index_uuids { + let index_dir = self + .base_path + .clone() + .join(MANIFEST_TABLE_NAME) + .join(LANCE_INDICES_DIR) + .join(index_uuid.to_string()); + if let Err(err) = object_store.remove_dir_all(index_dir.clone()).await + && !matches!(err, LanceError::NotFound { .. 
}) + { + log::warn!( + "Failed to clean up uncommitted manifest rewrite index directory '{}': {}", + index_dir, + err + ); + } + } } - async fn location_has_actual_manifests(&self, location: &str) -> Result { - Self::path_has_actual_manifests(&self.object_store, &self.base_path.clone().join(location)) + /// Resolve the commit handler for the `__manifest` dataset's storage backend. + async fn manifest_commit_handler(&self) -> Result> { + commit_handler_from_url(&self.root, &None) + .await + .map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to resolve manifest commit handler: {:?}", e), + }) + }) + } + + /// Directly write the rewritten `__manifest` as a new version using the storage + /// backend's atomic put-if-not-exists. The overwrite transaction is embedded inline + /// (no separate transaction file) and the commit handler writes the version hint. + async fn commit_manifest_overwrite( + &self, + dataset: &Dataset, + commit_handler: &dyn CommitHandler, + manifest: &mut Manifest, + indices: Option>, + transaction: Transaction, + ) -> std::result::Result<(), CommitError> { + apply_feature_flags(manifest, false, false).map_err(CommitError::from)?; + let timestamp_nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| d.as_nanos()) + .unwrap_or(0); + manifest.set_timestamp(timestamp_nanos); + manifest.update_max_fragment_id(); + + // Commit through the dataset's own object store, not `self.object_store`: for + // stores like `memory://` the namespace and the dataset can hold different + // instances, and a commit written to the wrong one is invisible to reads. 
+ let object_store = dataset + .object_store(None) + .await + .map_err(CommitError::from)?; + let base_path = self.base_path.clone().join(MANIFEST_TABLE_NAME); + let naming_scheme = dataset.manifest_location().naming_scheme; + commit_handler + .commit( + manifest, + indices, + &base_path, + &object_store, + write_manifest_file_to_path, + naming_scheme, + Some((&transaction).into()), + ) + .await + .map(|_location| ()) + } + + /// After an ambiguous commit error, determine whether our overwrite actually landed at + /// `target_version`. A network failure can leave the manifest committed even though the + /// client observed an error; in that case the committed version references one of our + /// staged data files, and deleting them would corrupt the catalog. + async fn manifest_commit_landed( + &self, + dataset: &Dataset, + target_version: u64, + data_files: &HashSet, + ) -> bool { + let Ok(committed) = dataset.checkout_version(target_version).await else { + return false; + }; + committed.manifest().fragments.iter().any(|fragment| { + fragment + .files + .iter() + .any(|file| data_files.contains(file.path.as_str())) + }) + } + + /// Resolve a storage commit conflict against the latest committed catalog state. + /// Returns `Some(output)` when the mutation's intent is already satisfied (no retry + /// needed), `Ok(None)` to retry the rewrite, or an error for a terminal conflict. + async fn resolve_manifest_conflict( + &self, + resolution: &ConflictResolution, + ) -> Result> { + match resolution { + ConflictResolution::Retry => Ok(None), + ConflictResolution::FailIfExists(object_ids) => { + for object_id in object_ids { + if self.manifest_contains_object(object_id).await? 
{ + return Err(NamespaceError::ConcurrentModification { + message: format!( + "Object '{}' was concurrently created by another operation", + object_id + ), + } + .into()); + } + } + Ok(None) + } + ConflictResolution::SucceedIfAbsent { object_id, output } => { + if self.manifest_contains_object(object_id).await? { + Ok(None) + } else { + Ok(Some(output.clone())) + } + } + } + } + + async fn rewrite_manifest( + &self, + operation: &str, + mut make_mutation: F, + ) -> Result + where + M: ManifestStreamMutation + 'static, + F: FnMut() -> M, + { + let _mutation_guard = self.manifest_mutation_lock.lock().await; + let max_retries = self.manifest_rewrite_commit_retries(); + let mut retries = 0; + let build_indices = self.inline_optimization_enabled; + let commit_handler = self.manifest_commit_handler().await?; + + loop { + let dataset_guard = self.manifest_dataset.get_refreshed().await?; + let dataset = Arc::new(dataset_guard.clone()); + drop(dataset_guard); + // Staged files, indices, the commit, and cleanup must all use the dataset's + // own object store (see `commit_manifest_overwrite`). + let object_store = dataset.object_store(None).await?; + + let source = Self::manifest_projected_stream(&dataset).await?; + let resolution = make_mutation().conflict_resolution(); + let shared = Arc::new(StdMutex::new(ManifestRewriteShared::new(make_mutation()))); + let output_stream = Self::manifest_rewrite_output_stream(source, shared.clone()); + // Pin both limits so the overwrite never splits into multiple fragments: the + // replacement indices map each row to address `(0 << 32) | offset`, valid only + // for a single fragment with id 0. The row count is bounded below u32::MAX by + // `ManifestIndexAccumulator::next_row_id`. 
+ let write_params = WriteParams { + mode: WriteMode::Overwrite, + session: self.session.clone(), + max_rows_per_file: u32::MAX as usize, + max_bytes_per_file: usize::MAX, + skip_auto_cleanup: true, + ..WriteParams::default() + }; + + let transaction = match InsertBuilder::new(dataset.clone()) + .with_params(&write_params) + .execute_uncommitted_stream(output_stream) + .await + { + Ok(transaction) => transaction, + Err(err) => { + if let Some(stream_err) = Self::take_manifest_rewrite_error(&shared)? { + return Err(stream_err); + } + return Err(convert_lance_commit_error(&err, operation, None)); + } + }; + + let (mutation, index_data) = Self::take_manifest_rewrite_result(&shared)?; + + let Operation::Overwrite { + fragments, schema, .. + } = &transaction.operation + else { + return Err(NamespaceError::Internal { + message: "Manifest rewrite transaction is not an overwrite".to_string(), + } + .into()); + }; + // Unique data files this attempt staged. Used to clean up orphans and to + // attribute an ambiguous commit error back to us. 
+ let staged_data_files = fragments + .iter() + .flat_map(|fragment| fragment.files.iter()) + .filter(|file| file.base_id.is_none()) + .map(|file| file.path.clone()) + .collect::>(); + + if !mutation.has_changes { + self.cleanup_staged_manifest_files(&object_store, &staged_data_files, &[]) + .await; + return Ok(mutation.result); + } + + let mut manifest = Self::manifest_from_overwrite_transaction( + dataset.manifest(), + schema.clone(), + fragments, + ); + let target_version = manifest.version; + + let index_uuids = [Uuid::new_v4(), Uuid::new_v4(), Uuid::new_v4()]; + let indices = if build_indices { + match Self::build_manifest_indices(&dataset, &manifest, index_data, index_uuids) + .await + { + Ok(indices) => Some(indices), + Err(err) => { + self.cleanup_staged_manifest_files( + &object_store, + &staged_data_files, + &index_uuids, + ) + .await; + return Err(err); + } + } + } else { + None + }; + let staged_index_uuids: &[Uuid] = if build_indices { &index_uuids } else { &[] }; + + let commit_result = self + .commit_manifest_overwrite( + &dataset, + commit_handler.as_ref(), + &mut manifest, + indices, + transaction, + ) + .await; + + match commit_result { + Ok(()) => { + let _ = self.manifest_dataset.get_refreshed().await; + return Ok(mutation.result); + } + Err(err) => { + // The put may have landed even though the client saw an error (lost + // ack). Verify before deleting anything so we never orphan files that a + // committed manifest still references. + if self + .manifest_commit_landed(&dataset, target_version, &staged_data_files) + .await + { + let _ = self.manifest_dataset.get_refreshed().await; + return Ok(mutation.result); + } + self.cleanup_staged_manifest_files( + &object_store, + &staged_data_files, + staged_index_uuids, + ) + .await; + match err { + CommitError::CommitConflict => { + if let Some(output) = + self.resolve_manifest_conflict(&resolution).await? 
+ { + return Ok(output); + } + if retries >= max_retries { + return Err(NamespaceError::ConcurrentModification { + message: format!( + "{}: still conflicting after {} retries", + operation, max_retries + ), + } + .into()); + } + retries += 1; + tokio::time::sleep(std::time::Duration::from_millis( + 10 * u64::from(retries), + )) + .await; + } + CommitError::OtherError(err) => { + return Err(convert_lance_commit_error(&err, operation, None)); + } + } + } + } + } + } + + /// Check if the manifest contains an object with the given ID + async fn manifest_contains_object(&self, object_id: &str) -> Result { + let escaped_id = object_id.replace('\'', "''"); + let filter = format!("object_id = '{}'", escaped_id); + + let dataset_guard = self.manifest_dataset.get().await?; + let mut scanner = dataset_guard.scan(); + + scanner.filter(&filter).map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to filter: {:?}", e), + }) + })?; + + // Project no columns and enable row IDs for count_rows to work + scanner.project::<&str>(&[]).map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to project: {:?}", e), + }) + })?; + + scanner.with_row_id(); + + let count = scanner.count_rows().await.map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to count rows: {:?}", e), + }) + })?; + + Ok(count > 0) + } + + /// Query the manifest for a table with the given object ID + async fn query_manifest_for_table(&self, object_id: &str) -> Result> { + let escaped_id = object_id.replace('\'', "''"); + let filter = format!("object_id = '{}' AND object_type = 'table'", escaped_id); + let mut scanner = self.manifest_scanner().await?; + scanner.filter(&filter).map_err(|e| { + lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to filter: {:?}", e), + }) + })?; + scanner + .project(&["object_id", "location", "metadata"]) + .map_err(|e| { + 
lance_core::Error::from(NamespaceError::Internal { + message: format!("Failed to project: {:?}", e), + }) + })?; + let batches = Self::execute_scanner(scanner).await?; + + let mut found_result: Option = None; + let mut total_rows = 0; + + for batch in batches { + if batch.num_rows() == 0 { + continue; + } + + total_rows += batch.num_rows(); + if total_rows > 1 { + return Err(NamespaceError::Internal { + message: format!( + "Expected exactly 1 table with id '{}', found {}", + object_id, total_rows + ), + } + .into()); + } + + let object_id_array = Self::get_string_column(&batch, "object_id")?; + let location_array = Self::get_string_column(&batch, "location")?; + let metadata_array = Self::get_string_column(&batch, "metadata")?; + let location = location_array.value(0).to_string(); + let metadata = if !metadata_array.is_null(0) { + let metadata_str = metadata_array.value(0); + match serde_json::from_str::>(metadata_str) { + Ok(map) => Some(map), + Err(e) => { + return Err(NamespaceError::Internal { + message: format!( + "Failed to deserialize metadata for table '{}': {}", + object_id, e + ), + } + .into()); + } + } + } else { + None + }; + let (namespace, name) = Self::parse_object_id(object_id_array.value(0)); + found_result = Some(TableInfo { + namespace, + name, + location, + metadata, + }); + } + + Ok(found_result) + } + + fn serialize_metadata( + properties: Option<&HashMap>, + object_type: &str, + object_id: &str, + ) -> Result> { + match properties { + Some(properties) if !properties.is_empty() => { + serde_json::to_string(properties).map(Some).map_err(|e| { + LanceError::from(NamespaceError::Internal { + message: format!( + "Failed to serialize {} metadata for '{}': {}", + object_type, object_id, e + ), + }) + }) + } + _ => Ok(None), + } + } + + pub(crate) async fn path_has_actual_manifests( + object_store: &ObjectStore, + table_path: &Path, + ) -> Result { + let versions_path = table_path + .clone() + .join(lance_table::io::commit::VERSIONS_DIR); + // 
`_versions/` should only contain manifest files, so probing the first entry is enough + // to distinguish declared-only tables (empty `_versions/`) from created tables. + Ok(object_store + .list(Some(versions_path)) + .try_next() + .await? + .is_some()) + } + + async fn location_has_actual_manifests(&self, location: &str) -> Result { + Self::path_has_actual_manifests(&self.object_store, &self.base_path.clone().join(location)) .await } @@ -999,7 +2364,6 @@ impl ManifestNamespace { /// Insert one or more entries into the manifest table with metadata and base_objects. /// /// This is the unified entry point for both single and batch inserts. - /// Uses a single MergeInsert operation to insert all entries at once. /// If any entry already exists (matching object_id), the entire batch fails. pub async fn insert_into_manifest_with_metadata( &self, @@ -1029,159 +2393,21 @@ impl ManifestNamespace { return Ok(()); } - let schema = Self::manifest_schema(); - - let mut object_ids = Vec::with_capacity(entries.len()); - let mut object_types = Vec::with_capacity(entries.len()); - let mut locations: Vec> = Vec::with_capacity(entries.len()); - let mut metadatas: Vec> = Vec::with_capacity(entries.len()); - - let string_builder = StringBuilder::new(); - let mut list_builder = ListBuilder::new(string_builder).with_field(Arc::new(Field::new( - "object_id", - DataType::Utf8, - true, - ))); - - for (i, entry) in entries.iter().enumerate() { - object_ids.push(entry.object_id.as_str()); - object_types.push(entry.object_type.as_str()); - locations.push(entry.location.clone()); - metadatas.push(entry.metadata.clone()); - - // Only the first entry gets the base_objects (for single-entry inserts - // with base_objects like view creation); batch entries use null. 
- if i == 0 { - match &base_objects { - Some(objects) => { - for obj in objects { - list_builder.values().append_value(obj); - } - list_builder.append(true); - } - None => { - list_builder.append_null(); - } - } - } else { - list_builder.append_null(); - } - } - - let base_objects_array = list_builder.finish(); - - let location_array: Arc = Arc::new(StringArray::from( - locations.iter().map(|l| l.as_deref()).collect::>(), - )); - - let metadata_array: Arc = Arc::new(StringArray::from( - metadatas.iter().map(|m| m.as_deref()).collect::>(), - )); - - let batch = RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(StringArray::from(object_ids)), - Arc::new(StringArray::from(object_types.to_vec())), - location_array, - metadata_array, - Arc::new(base_objects_array), - ], - ) - .map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to create manifest entries: {:?}", e), - }) - })?; + self.rewrite_manifest("Failed to overwrite manifest", || { + UpsertManifestMutation::new(entries.clone(), base_objects.clone(), when_matched.clone()) + }) + .await + } - let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); - - // Use MergeInsert so callers can choose fail-on-existing inserts or metadata upserts. - let _mutation_guard = self.manifest_mutation_lock.lock().await; - let dataset_guard = self.manifest_dataset.get().await?; - let dataset_arc = Arc::new(dataset_guard.clone()); - drop(dataset_guard); // Drop read guard before merge insert - - let mut merge_builder = - MergeInsertBuilder::try_new(dataset_arc, vec!["object_id".to_string()]).map_err( - |e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to create merge builder: {:?}", e), - }) - }, - )?; - merge_builder.when_matched(when_matched); - merge_builder.when_not_matched(WhenNotMatched::InsertAll); - // Use conflict_retries to handle cross-process races on manifest mutations. 
- merge_builder.conflict_retries(5); - // TODO: after BTREE index creation on object_id, has_scalar_index=true causes - // MergeInsert to use V1 path which lacks bloom filters for conflict detection. This - // results in (Some, None) filter mismatch when rebasing against V2 operations. - // Setting use_index=false ensures all operations consistently use V2 path. - merge_builder.use_index(false); - if let Some(retries) = self.commit_retries { - merge_builder.commit_retries(retries); - } - - let (new_dataset_arc, _merge_stats) = merge_builder - .try_build() - .map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to build merge: {:?}", e), - }) - })? - .execute_reader(Box::new(reader)) - .await - .map_err(|e| { - convert_lance_commit_error(&e, "Failed to execute merge insert into manifest", None) - })?; - - let new_dataset = Arc::try_unwrap(new_dataset_arc).unwrap_or_else(|arc| (*arc).clone()); - self.manifest_dataset.set_latest(new_dataset).await; - - // Run inline optimization after write - if let Err(e) = self.run_inline_optimization().await { - log::warn!( - "Unexpected failure when running inline optimization: {:?}", - e - ); - } - - Ok(()) - } - - /// Delete an entry from the manifest table - pub async fn delete_from_manifest(&self, object_id: &str) -> Result<()> { - let predicate = format!("object_id = '{}'", object_id); - - // Get dataset and use DeleteBuilder with configured retries - let _mutation_guard = self.manifest_mutation_lock.lock().await; - let dataset_guard = self.manifest_dataset.get().await?; - let dataset = Arc::new(dataset_guard.clone()); - drop(dataset_guard); // Drop read guard before delete - - let new_dataset = DeleteBuilder::new(dataset, &predicate) - .execute() - .await - .map_err(|e| convert_lance_commit_error(&e, "Failed to delete", None))?; - - // Update the wrapper with the new dataset - self.manifest_dataset - .set_latest( - Arc::try_unwrap(new_dataset.new_dataset).unwrap_or_else(|arc| 
(*arc).clone()), - ) - .await; - - // Run inline optimization after delete - if let Err(e) = self.run_inline_optimization().await { - log::warn!( - "Unexpected failure when running inline optimization: {:?}", - e - ); - } - - Ok(()) - } + /// Delete an entry from the manifest table + pub async fn delete_from_manifest(&self, object_id: &str) -> Result<()> { + let object_id = object_id.to_string(); + self.rewrite_manifest("Failed to delete from manifest", || DeleteObjectMutation { + object_id: object_id.clone(), + deleted: false, + }) + .await + } /// Query the manifest for all versions of a table, sorted by version. /// @@ -1302,90 +2528,55 @@ impl ManifestNamespace { /// `object_type = 'table_version'` entries whose object_id matches /// `{object_id}${zero_padded_version}`. /// - /// Builds a single filter expression covering all version ranges and executes - /// one bulk delete operation instead of deleting versions one at a time. + /// Applies the ranges while streaming the manifest rewrite, without expanding + /// sparse ranges into every possible version object id. 
pub async fn delete_table_versions( &self, object_id: &str, ranges: &[(i64, i64)], ) -> Result { - if ranges.is_empty() { - return Ok(0); - } - - // Collect all object_ids to delete (both new zero-padded and legacy formats) - let mut object_id_conditions: Vec = Vec::new(); - for (start, end) in ranges { - for version in *start..=*end { - let oid = Self::build_version_object_id(object_id, version); - let escaped = oid.replace('\'', "''"); - object_id_conditions.push(format!("'{}'", escaped)); - } - } - - if object_id_conditions.is_empty() { - return Ok(0); - } - - // First, count how many entries exist so we can report the deleted count - let in_list = object_id_conditions.join(", "); - let filter = format!( - "object_type = 'table_version' AND object_id IN ({})", - in_list - ); + self.batch_delete_table_versions_by_ranges(&[(object_id.to_string(), ranges.to_vec())]) + .await + } - let mut scanner = self.manifest_scanner().await?; - scanner.filter(&filter).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to filter: {:?}", e), - }) - })?; - scanner.project(&["object_id", "location"]).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to project: {:?}", e), + /// Atomically delete table version entries from the manifest for multiple + /// tables and version ranges. 
+ pub async fn batch_delete_table_versions_by_ranges( + &self, + table_ranges: &[(String, Vec<(i64, i64)>)], + ) -> Result { + let targets = table_ranges + .iter() + .filter_map(|(object_id, ranges)| { + let ranges = Self::normalize_table_version_ranges(ranges); + if ranges.is_empty() { + None + } else { + Some(DeleteTableVersionRangeTarget { + object_id_prefix: Self::build_version_object_id_prefix(object_id), + ranges, + }) + } }) - })?; - let batches = Self::execute_scanner(scanner).await?; - let deleted_count: i64 = batches.iter().map(|b| b.num_rows() as i64).sum(); - - if deleted_count == 0 { + .collect::>(); + if targets.is_empty() { return Ok(0); } - // Execute a single bulk delete with the combined filter - let _mutation_guard = self.manifest_mutation_lock.lock().await; - let dataset_guard = self.manifest_dataset.get().await?; - let dataset = Arc::new(dataset_guard.clone()); - drop(dataset_guard); - - let new_dataset = DeleteBuilder::new(dataset, &filter) - .execute() - .await - .map_err(|e| { - convert_lance_commit_error(&e, "Failed to batch delete table versions", None) - })?; - - self.manifest_dataset - .set_latest( - Arc::try_unwrap(new_dataset.new_dataset).unwrap_or_else(|arc| (*arc).clone()), - ) - .await; - - if let Err(e) = self.run_inline_optimization().await { - log::warn!( - "Unexpected failure when running inline optimization: {:?}", - e - ); - } - - Ok(deleted_count) + self.rewrite_manifest("Failed to delete table versions from manifest", || { + DeleteTableVersionsMutation { + target: DeleteTableVersionsTarget::Ranges(targets.clone()), + deleted_count: 0, + } + }) + .await } /// Atomically delete table version entries from the manifest by their object_ids. /// /// This method supports multi-table transactional deletion: all specified /// object_ids (which may span multiple tables) are deleted in a single atomic - /// `DeleteBuilder` operation. Either all entries are removed or none are. + /// copy-on-write manifest rewrite. 
Either all entries are removed or none are. /// /// Object IDs are formatted as `{table_id}${version}`. pub async fn batch_delete_table_versions_by_object_ids( @@ -1396,70 +2587,14 @@ impl ManifestNamespace { return Ok(0); } - let in_list: String = object_ids - .iter() - .map(|oid| { - let escaped = oid.replace('\'', "''"); - format!("'{}'", escaped) - }) - .collect::>() - .join(", "); - - let filter = format!( - "object_type = 'table_version' AND object_id IN ({})", - in_list - ); - - // Count how many entries exist so we can report the deleted count - let mut scanner = self.manifest_scanner().await?; - scanner.filter(&filter).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to filter: {:?}", e), - }) - })?; - scanner.project(&["object_id", "location"]).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to project: {:?}", e), - }) - })?; - let batches = Self::execute_scanner(scanner).await?; - let deleted_count: i64 = batches.iter().map(|b| b.num_rows() as i64).sum(); - - if deleted_count == 0 { - return Ok(0); - } - - // Execute a single atomic bulk delete covering all tables - let _mutation_guard = self.manifest_mutation_lock.lock().await; - let dataset_guard = self.manifest_dataset.get().await?; - let dataset = Arc::new(dataset_guard.clone()); - drop(dataset_guard); - - let new_dataset = DeleteBuilder::new(dataset, &filter) - .execute() - .await - .map_err(|e| { - convert_lance_commit_error( - &e, - "Failed to batch delete table versions across multiple tables", - None, - ) - })?; - - self.manifest_dataset - .set_latest( - Arc::try_unwrap(new_dataset.new_dataset).unwrap_or_else(|arc| (*arc).clone()), - ) - .await; - - if let Err(e) = self.run_inline_optimization().await { - log::warn!( - "Unexpected failure when running inline optimization: {:?}", - e - ); - } - - Ok(deleted_count) + let object_ids = object_ids.iter().cloned().collect::>(); + self.rewrite_manifest("Failed to 
delete table versions from manifest", || { + DeleteTableVersionsMutation { + target: DeleteTableVersionsTarget::ObjectIds(object_ids.clone()), + deleted_count: 0, + } + }) + .await } /// Set a property flag in the __manifest table's metadata key-value map. @@ -2866,15 +4001,230 @@ impl LanceNamespace for ManifestNamespace { #[cfg(test)] mod tests { - use crate::{DirectoryNamespaceBuilder, ManifestNamespace}; + use super::{ + BASE_OBJECTS_INDEX_NAME, ConflictResolution, CopyOnWriteMutation, DeleteObjectMutation, + LANCE_DATA_DIR, LANCE_INDICES_DIR, MANIFEST_TABLE_NAME, ManifestBatchBuilder, + ManifestEntry, ManifestIndexAccumulator, ManifestNamespace, ManifestOutputRow, + ManifestRowValue, ManifestStreamMutation, OBJECT_ID_INDEX_NAME, OBJECT_TYPE_INDEX_NAME, + ObjectType, + }; + use crate::DirectoryNamespaceBuilder; + use arrow::datatypes::DataType; use bytes::Bytes; + use futures::StreamExt; + use lance::index::DatasetIndexExt; use lance_core::utils::tempfile::TempStdDir; + use lance_io::object_store::{ObjectStore, ObjectStoreParams, ObjectStoreRegistry}; use lance_namespace::LanceNamespace; use lance_namespace::models::{ CreateNamespaceRequest, CreateTableRequest, DescribeTableRequest, DropTableRequest, ListTablesRequest, TableExistsRequest, }; + use lance_table::format::Fragment; use rstest::rstest; + use std::collections::{HashMap, HashSet}; + use std::sync::Arc; + + async fn create_manifest_namespace( + root: &str, + inline_optimization_enabled: bool, + ) -> ManifestNamespace { + create_manifest_namespace_with_retries(root, inline_optimization_enabled, None).await + } + + async fn create_manifest_namespace_with_retries( + root: &str, + inline_optimization_enabled: bool, + commit_retries: Option, + ) -> ManifestNamespace { + let (object_store, base_path) = ObjectStore::from_uri_and_params( + Arc::new(ObjectStoreRegistry::default()), + root, + &ObjectStoreParams::default(), + ) + .await + .unwrap(); + ManifestNamespace::from_directory( + root.to_string(), + None, 
+ None, + object_store, + base_path, + true, + inline_optimization_enabled, + commit_retries, + false, + ) + .await + .unwrap() + } + + struct CommitConflictAfterRewriteMutation { + root: String, + conflict_object_id: String, + } + + impl ManifestStreamMutation for CommitConflictAfterRewriteMutation { + type Output = (); + + fn process_existing_row( + &mut self, + row: ManifestRowValue, + output: &mut ManifestBatchBuilder, + index_data: &mut ManifestIndexAccumulator, + ) -> lance_core::Result<()> { + output.append( + index_data, + ManifestOutputRow { + object_id: &row.object_id, + object_type: row.object_type, + location: row.location.as_deref(), + metadata: row.metadata.as_deref(), + base_objects: row.base_objects.as_deref(), + }, + ) + } + + fn append_rows( + &mut self, + output: &mut ManifestBatchBuilder, + index_data: &mut ManifestIndexAccumulator, + ) -> lance_core::Result<()> { + output.append( + index_data, + ManifestOutputRow { + object_id: "attempted_table", + object_type: ObjectType::Table, + location: Some("attempted_table.lance"), + metadata: None, + base_objects: None, + }, + ) + } + + fn finish(&self) -> CopyOnWriteMutation { + let root = self.root.clone(); + let object_id = self.conflict_object_id.clone(); + std::thread::spawn(move || { + let runtime = tokio::runtime::Runtime::new().unwrap(); + runtime.block_on(async move { + let writer = create_manifest_namespace(&root, false).await; + writer + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id, + object_type: ObjectType::Table, + location: Some("conflicting_table.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); + }); + }) + .join() + .unwrap(); + CopyOnWriteMutation::updated(()) + } + } + + /// A delete mutation that, during staging, has a concurrent writer delete the same + /// object and commit first, so our own commit hits a conflict while the object is + /// already gone — exercising `ConflictResolution::SucceedIfAbsent`. 
+ struct ConcurrentDeleteBeforeCommitMutation { + inner: DeleteObjectMutation, + root: String, + target: String, + } + + impl ManifestStreamMutation for ConcurrentDeleteBeforeCommitMutation { + type Output = (); + + fn process_existing_row( + &mut self, + row: ManifestRowValue, + output: &mut ManifestBatchBuilder, + index_data: &mut ManifestIndexAccumulator, + ) -> lance_core::Result<()> { + self.inner.process_existing_row(row, output, index_data) + } + + fn append_rows( + &mut self, + output: &mut ManifestBatchBuilder, + index_data: &mut ManifestIndexAccumulator, + ) -> lance_core::Result<()> { + self.inner.append_rows(output, index_data) + } + + fn finish(&self) -> CopyOnWriteMutation { + let root = self.root.clone(); + let target = self.target.clone(); + std::thread::spawn(move || { + let runtime = tokio::runtime::Runtime::new().unwrap(); + runtime.block_on(async move { + let writer = create_manifest_namespace(&root, false).await; + writer.delete_from_manifest(&target).await.unwrap(); + }); + }) + .join() + .unwrap(); + self.inner.finish() + } + + fn conflict_resolution(&self) -> ConflictResolution { + ConflictResolution::SucceedIfAbsent { + object_id: self.target.clone(), + output: (), + } + } + } + + async fn manifest_base_objects( + manifest_ns: &ManifestNamespace, + ) -> HashMap>> { + let mut scanner = manifest_ns.manifest_scanner().await.unwrap(); + scanner.project(&["object_id", "base_objects"]).unwrap(); + let batches = ManifestNamespace::execute_scanner(scanner).await.unwrap(); + let mut rows = HashMap::new(); + for batch in batches { + let object_ids = ManifestNamespace::get_string_column(&batch, "object_id").unwrap(); + let base_objects = ManifestNamespace::base_objects_column_values(&batch).unwrap(); + for (row, value) in base_objects.into_iter().enumerate() { + rows.insert(object_ids.value(row).to_string(), value); + } + } + rows + } + + async fn manifest_data_paths(manifest_ns: &ManifestNamespace) -> HashSet { + let data_dir = manifest_ns + 
.base_path + .clone() + .join(MANIFEST_TABLE_NAME) + .join(LANCE_DATA_DIR); + let mut stream = manifest_ns.object_store.read_dir_all(&data_dir, None); + let mut paths = HashSet::new(); + while let Some(meta) = stream.next().await.transpose().unwrap() { + paths.insert(meta.location.to_string()); + } + paths + } + + async fn manifest_index_paths(manifest_ns: &ManifestNamespace) -> HashSet { + let index_dir = manifest_ns + .base_path + .clone() + .join(MANIFEST_TABLE_NAME) + .join(LANCE_INDICES_DIR); + let mut stream = manifest_ns.object_store.read_dir_all(&index_dir, None); + let mut paths = HashSet::new(); + while let Some(meta) = stream.next().await.transpose().unwrap() { + paths.insert(meta.location.to_string()); + } + paths + } fn create_test_ipc_data() -> Vec { use arrow::array::{Int32Array, StringArray}; @@ -2906,6 +4256,516 @@ mod tests { buffer } + #[tokio::test] + async fn test_manifest_rewrite_preserves_utf8_metadata_and_base_objects() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace(temp_path, true).await; + + manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "view".to_string(), + object_type: ObjectType::Table, + location: Some("view.lance".to_string()), + metadata: Some(r#"{"kind":"view"}"#.to_string()), + }], + Some(vec!["base_a".to_string(), "base_b".to_string()]), + ) + .await + .unwrap(); + manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "other".to_string(), + object_type: ObjectType::Namespace, + location: None, + metadata: Some(r#"{"kind":"namespace"}"#.to_string()), + }], + None, + ) + .await + .unwrap(); + + let dataset_guard = manifest_ns.manifest_dataset.get().await.unwrap(); + let metadata_field = dataset_guard.schema().field("metadata").unwrap(); + assert_eq!(metadata_field.data_type(), DataType::Utf8); + drop(dataset_guard); + + let base_objects = 
manifest_base_objects(&manifest_ns).await; + assert_eq!( + base_objects.get("view").cloned().unwrap(), + Some(vec!["base_a".to_string(), "base_b".to_string()]) + ); + assert_eq!(base_objects.get("other").cloned().unwrap(), None); + } + + #[tokio::test] + async fn test_manifest_rewrite_replacement_indices_are_versioned() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace(temp_path, true).await; + + manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "table".to_string(), + object_type: ObjectType::Table, + location: Some("table.lance".to_string()), + metadata: None, + }], + Some(vec!["base".to_string()]), + ) + .await + .unwrap(); + + let dataset_guard = manifest_ns.manifest_dataset.get().await.unwrap(); + let dataset_version = dataset_guard.version().version; + let indices = dataset_guard.load_indices().await.unwrap(); + let names = indices + .iter() + .map(|index| index.name.as_str()) + .collect::>(); + assert!(names.contains(OBJECT_ID_INDEX_NAME)); + assert!(names.contains(OBJECT_TYPE_INDEX_NAME)); + assert!(names.contains(BASE_OBJECTS_INDEX_NAME)); + for index in indices.iter() { + assert_eq!(index.dataset_version, dataset_version); + assert!(!index.fragment_bitmap.as_ref().unwrap().is_empty()); + } + } + + #[tokio::test] + async fn test_manifest_rewrite_empty_manifest_keeps_replacement_indices_valid() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace(temp_path, true).await; + + manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "table".to_string(), + object_type: ObjectType::Table, + location: Some("table.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); + manifest_ns.delete_from_manifest("table").await.unwrap(); + + assert!(!manifest_ns.manifest_contains_object("table").await.unwrap()); + let mut 
scanner = manifest_ns.manifest_scanner().await.unwrap(); + scanner.project(&["object_id"]).unwrap(); + let rows = ManifestNamespace::execute_scanner(scanner) + .await + .unwrap() + .into_iter() + .map(|batch| batch.num_rows()) + .sum::(); + assert_eq!(rows, 0); + + let dataset_guard = manifest_ns.manifest_dataset.get().await.unwrap(); + let dataset_version = dataset_guard.version().version; + let indices = dataset_guard.load_indices().await.unwrap(); + let names = indices + .iter() + .map(|index| index.name.as_str()) + .collect::>(); + assert!(names.contains(OBJECT_ID_INDEX_NAME)); + assert!(names.contains(OBJECT_TYPE_INDEX_NAME)); + assert!(names.contains(BASE_OBJECTS_INDEX_NAME)); + for index in indices.iter() { + assert_eq!(index.dataset_version, dataset_version); + } + } + + #[tokio::test] + async fn test_manifest_rewrite_fragment_bitmap_uses_overwrite_fragment_ids() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace(temp_path, false).await; + let dataset_guard = manifest_ns.manifest_dataset.get().await.unwrap(); + let fragments = vec![Fragment::new(0), Fragment::new(0), Fragment::new(7)]; + + let manifest = ManifestNamespace::manifest_from_overwrite_transaction( + dataset_guard.manifest(), + dataset_guard.manifest().schema.clone(), + &fragments, + ); + + let fragment_ids = manifest + .fragments + .iter() + .map(|fragment| fragment.id) + .collect::>(); + assert_eq!(fragment_ids, vec![0, 1, 7]); + assert_eq!( + ManifestNamespace::manifest_fragment_bitmap(&manifest) + .unwrap() + .into_iter() + .collect::>(), + vec![0, 1, 7] + ); + } + + #[tokio::test] + async fn test_manifest_delete_table_versions_by_ranges() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace(temp_path, false).await; + let table_id = "table"; + let entries = (1..=5) + .map(|version| ManifestEntry { + object_id: 
ManifestNamespace::build_version_object_id(table_id, version), + object_type: ObjectType::TableVersion, + location: None, + metadata: Some( + serde_json::json!({ + "manifest_path": format!("_versions/{}.manifest", version), + }) + .to_string(), + ), + }) + .collect::>(); + manifest_ns + .insert_into_manifest_with_metadata(entries, None) + .await + .unwrap(); + + let deleted = manifest_ns + .delete_table_versions(table_id, &[(2, 3), (5, 5)]) + .await + .unwrap(); + assert_eq!(deleted, 3); + + let remaining = manifest_ns + .query_table_versions(table_id, false, None) + .await + .unwrap() + .into_iter() + .map(|(version, _)| version) + .collect::>(); + assert_eq!(remaining, vec![1, 4]); + } + + #[tokio::test] + async fn test_manifest_delete_table_versions_by_object_ids() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace(temp_path, false).await; + let table_id = "table"; + let entries = (1..=3) + .map(|version| ManifestEntry { + object_id: ManifestNamespace::build_version_object_id(table_id, version), + object_type: ObjectType::TableVersion, + location: None, + metadata: Some( + serde_json::json!({ + "manifest_path": format!("_versions/{}.manifest", version), + }) + .to_string(), + ), + }) + .collect::>(); + manifest_ns + .insert_into_manifest_with_metadata(entries, None) + .await + .unwrap(); + + let object_ids = vec![ + ManifestNamespace::build_version_object_id(table_id, 1), + ManifestNamespace::build_version_object_id(table_id, 3), + ]; + let deleted = manifest_ns + .batch_delete_table_versions_by_object_ids(&object_ids) + .await + .unwrap(); + assert_eq!(deleted, 2); + + let remaining = manifest_ns + .query_table_versions(table_id, false, None) + .await + .unwrap() + .into_iter() + .map(|(version, _)| version) + .collect::>(); + assert_eq!(remaining, vec![2]); + } + + #[tokio::test] + async fn test_manifest_noop_delete_uses_latest_snapshot() { + let temp_dir = 
TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let stale_ns = create_manifest_namespace(temp_path, false).await; + let writer_ns = create_manifest_namespace(temp_path, false).await; + + writer_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "late_table".to_string(), + object_type: ObjectType::Table, + location: Some("late_table.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); + + stale_ns.delete_from_manifest("late_table").await.unwrap(); + + let check_ns = create_manifest_namespace(temp_path, false).await; + assert!( + !check_ns + .manifest_contains_object("late_table") + .await + .unwrap() + ); + } + + #[tokio::test] + async fn test_manifest_noop_delete_cleans_uncommitted_data_file() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace(temp_path, false).await; + + manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "table".to_string(), + object_type: ObjectType::Table, + location: Some("table.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); + + let before = manifest_data_paths(&manifest_ns).await; + assert!(!before.is_empty()); + + manifest_ns + .delete_from_manifest("missing_table") + .await + .unwrap(); + + let after = manifest_data_paths(&manifest_ns).await; + assert_eq!(after, before); + } + + #[tokio::test] + async fn test_manifest_final_commit_failure_cleans_uncommitted_rewrite_files() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace_with_retries(temp_path, true, Some(0)).await; + + manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "table".to_string(), + object_type: ObjectType::Table, + location: Some("table.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); + + let before_data_paths 
= manifest_data_paths(&manifest_ns).await; + let before_index_paths = manifest_index_paths(&manifest_ns).await; + + let result = manifest_ns + .rewrite_manifest("Failed to test manifest cleanup", || { + CommitConflictAfterRewriteMutation { + root: temp_path.to_string(), + conflict_object_id: "conflicting_table".to_string(), + } + }) + .await; + assert!(result.is_err()); + + let after_data_paths = manifest_data_paths(&manifest_ns).await; + assert!(before_data_paths.is_subset(&after_data_paths)); + assert_eq!(after_data_paths.len(), before_data_paths.len() + 1); + assert_eq!(manifest_index_paths(&manifest_ns).await, before_index_paths); + assert!( + manifest_ns + .manifest_contains_object("conflicting_table") + .await + .unwrap() + ); + assert!( + !manifest_ns + .manifest_contains_object("attempted_table") + .await + .unwrap() + ); + } + + #[tokio::test] + async fn test_manifest_commit_visible_on_memory_store() { + // Regression: the commit must use the same object store the manifest dataset reads + // from. On `memory://` the namespace store and the dataset store can be different + // in-memory instances, so a commit written to the wrong one is invisible to reads + // (manifests as stale version -> endless conflict / "not found"). + let manifest_ns = create_manifest_namespace("memory://test_commit_visible", false).await; + manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "table".to_string(), + object_type: ObjectType::Table, + location: Some("table.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); + assert!(manifest_ns.manifest_contains_object("table").await.unwrap()); + // A second sequential commit must not falsely conflict. 
+ manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "table2".to_string(), + object_type: ObjectType::Table, + location: Some("table2.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); + assert!( + manifest_ns + .manifest_contains_object("table2") + .await + .unwrap() + ); + } + + #[tokio::test] + async fn test_manifest_commit_uses_inline_transaction() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace(temp_path, false).await; + + manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "table".to_string(), + object_type: ObjectType::Table, + location: Some("table.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); + + let dataset_guard = manifest_ns.manifest_dataset.get().await.unwrap(); + let manifest = dataset_guard.manifest(); + // The overwrite transaction is embedded inline in the manifest, never written as a + // separate _transactions/*.txn file. 
+ assert!(manifest.transaction_section.is_some()); + assert!(manifest.transaction_file.is_none()); + } + + #[tokio::test] + async fn test_manifest_commit_landed_attributes_data_file() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace(temp_path, false).await; + + manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "table".to_string(), + object_type: ObjectType::Table, + location: Some("table.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); + + let dataset = Arc::new(manifest_ns.manifest_dataset.get().await.unwrap().clone()); + let version = dataset.manifest().version; + let our_files = dataset + .manifest() + .fragments + .iter() + .flat_map(|fragment| fragment.files.iter()) + .map(|file| file.path.clone()) + .collect::>(); + assert!(!our_files.is_empty()); + + // The committed version references our data file => attributed to us (a lost-ack + // commit must be treated as success, not cleaned up). + assert!( + manifest_ns + .manifest_commit_landed(&dataset, version, &our_files) + .await + ); + // A different file set is not attributed to us. + let other = HashSet::from(["missing.lance".to_string()]); + assert!( + !manifest_ns + .manifest_commit_landed(&dataset, version, &other) + .await + ); + // A version that does not exist did not land. 
+ assert!( + !manifest_ns + .manifest_commit_landed(&dataset, version + 100, &our_files) + .await + ); + } + + #[tokio::test] + async fn test_manifest_delete_conflict_with_concurrent_delete_succeeds() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace_with_retries(temp_path, false, Some(0)).await; + + manifest_ns + .insert_into_manifest_with_metadata( + vec![ManifestEntry { + object_id: "table".to_string(), + object_type: ObjectType::Table, + location: Some("table.lance".to_string()), + metadata: None, + }], + None, + ) + .await + .unwrap(); + assert!(manifest_ns.manifest_contains_object("table").await.unwrap()); + + // A concurrent writer deletes "table" and commits first, so our own delete commit + // conflicts while "table" is already gone. Native resolution treats the goal as + // achieved and succeeds instead of erroring or retrying forever. + let result = manifest_ns + .rewrite_manifest("Failed to delete from manifest", || { + ConcurrentDeleteBeforeCommitMutation { + inner: DeleteObjectMutation { + object_id: "table".to_string(), + deleted: false, + }, + root: temp_path.to_string(), + target: "table".to_string(), + } + }) + .await; + + assert!(result.is_ok(), "delete should succeed: {result:?}"); + assert!(!manifest_ns.manifest_contains_object("table").await.unwrap()); + } + #[rstest] #[case::with_optimization(true)] #[case::without_optimization(false)] @@ -3939,9 +5799,9 @@ mod tests { /// Test that concurrent create_table calls for the same table name don't /// create duplicate entries in the manifest. Uses two independent /// ManifestNamespace instances pointing at the same directory to simulate - /// two separate OS processes racing on table creation. The conflict_retries - /// setting on the MergeInsert ensures the second operation properly detects - /// the duplicate via WhenMatched::Fail after retrying against the latest data. 
+ /// two separate OS processes racing on table creation. Copy-on-write rewrite + /// retries ensure the second operation detects the duplicate after retrying + /// against the latest data. #[tokio::test] async fn test_concurrent_create_table_no_duplicates() { let temp_dir = TempStdDir::default(); From d3f0db3a56993aeba035119e1022fa239c8e626b Mon Sep 17 00:00:00 2001 From: Brendan Clement Date: Thu, 11 Jun 2026 10:59:12 -0700 Subject: [PATCH 088/177] fix: branch support on object stores, python branch ops, s3/ddb integration tests (#7219) Adds the first integration coverage for table branches (localstack S3 + DynamoDB) and fixes the four bugs it caught. All four were invisible to local-FS tests. - Branch locations broke uris that carry query strings. BranchLocation appended tree/ after the query string, so s3+ddb://...?ddbTableName=x produced a branch uri that re-parses to the main table path. Branches now work end to end over s3+ddb://. - create_branch dropped the dataset's store params. It passed store_params.unwrap_or_default() to CommitBuilder, so the commit's existence probe built an object store with no endpoint or region. It now falls back to the dataset's own params, matching shallow_clone. - The managed manifest store treated a missing chain as a fatal error. Probing a branch location before the branch exists surfaced the namespace's not-found instead of "no versions". get_latest_version now reports a missing table or branch chain as None, matching the ExternalManifestStore contract and the store's own behavior for empty main chains. - The Python namespace adapter flattened error identity. Exceptions from custom Python namespaces all arrived in Rust as opaque IO strings. The adapter now reconstructs the NamespaceError from the lance_namespace exception's error code, gated on the exception actually being a LanceNamespaceError so a foreign exception that happens to carry an integer code is never reinterpreted (it propagates as itself). 
Custom namespaces now raise the same typed exceptions as the native ones. --- python/pyproject.toml | 2 +- python/python/lance/dataset.py | 4 + python/python/lance/namespace.py | 48 +++++++ python/python/tests/test_dataset.py | 3 + python/python/tests/test_namespace_dir.py | 127 ++++++++++++++++++ .../tests/test_namespace_integration.py | 96 +++++++++++++ python/python/tests/test_s3_ddb.py | 52 +++++++ python/src/namespace.rs | 102 +++++++++++++- python/uv.lock | 14 +- rust/lance/src/dataset.rs | 5 +- rust/lance/src/dataset/branch_location.rs | 59 ++++++-- .../lance/src/io/commit/namespace_manifest.rs | 116 +++++++++++++++- 12 files changed, 609 insertions(+), 19 deletions(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index a1e69855a0f..d2efab23579 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "pylance" dynamic = ["version"] -dependencies = ["pyarrow>=14", "numpy>=1.22", "lance-namespace>=0.8.0,<0.9"] +dependencies = ["pyarrow>=14", "numpy>=1.22", "lance-namespace>=0.8.5,<0.9"] description = "python wrapper for Lance columnar format" authors = [{ name = "Lance Devs", email = "dev@lance.org" }] license = { file = "LICENSE" } diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index e96d9305ce5..dae72b88b1c 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -950,6 +950,9 @@ def create_branch( ds._base_store_params = self._base_store_params ds._namespace_client = self._namespace_client ds._table_id = self._table_id + ds._namespace_client_managed_versioning = ( + self._namespace_client_managed_versioning + ) ds._default_scan_options = self._default_scan_options ds._read_params = self._read_params return ds @@ -4579,6 +4582,7 @@ def commit_batch( ds._base_store_params = base_store_params ds._namespace_client = None ds._table_id = None + ds._namespace_client_managed_versioning = False ds._default_scan_options = None ds._read_params = None return 
BulkCommitResult( diff --git a/python/python/lance/namespace.py b/python/python/lance/namespace.py index f448e5c3368..fec3a1cfb1e 100644 --- a/python/python/lance/namespace.py +++ b/python/python/lance/namespace.py @@ -32,6 +32,8 @@ CreateMaterializedViewResponse, CreateNamespaceRequest, CreateNamespaceResponse, + CreateTableBranchRequest, + CreateTableBranchResponse, CreateTableIndexRequest, CreateTableIndexResponse, CreateTableRequest, @@ -42,6 +44,8 @@ DeclareTableResponse, DeleteFromTableRequest, DeleteFromTableResponse, + DeleteTableBranchRequest, + DeleteTableBranchResponse, DeleteTableTagRequest, DeleteTableTagResponse, DeregisterTableRequest, @@ -70,6 +74,8 @@ LanceNamespace, ListNamespacesRequest, ListNamespacesResponse, + ListTableBranchesRequest, + ListTableBranchesResponse, ListTableIndicesRequest, ListTableIndicesResponse, ListTablesRequest, @@ -850,6 +856,27 @@ def update_table_tag( response_dict = self._inner.update_table_tag(request.model_dump()) return UpdateTableTagResponse.from_dict(response_dict) + def create_table_branch( + self, request: CreateTableBranchRequest + ) -> CreateTableBranchResponse: + """Create a new branch forked from a table version.""" + response_dict = self._inner.create_table_branch(request.model_dump()) + return CreateTableBranchResponse.from_dict(response_dict) + + def list_table_branches( + self, request: ListTableBranchesRequest + ) -> ListTableBranchesResponse: + """List all branches of a table.""" + response_dict = self._inner.list_table_branches(request.model_dump()) + return ListTableBranchesResponse.from_dict(response_dict) + + def delete_table_branch( + self, request: DeleteTableBranchRequest + ) -> DeleteTableBranchResponse: + """Delete a branch from a table.""" + response_dict = self._inner.delete_table_branch(request.model_dump()) + return DeleteTableBranchResponse.from_dict(response_dict) + # Operation metrics methods def retrieve_ops_metrics(self) -> Dict[str, int]: @@ -1420,6 +1447,27 @@ def update_table_tag( 
response_dict = self._inner.update_table_tag(request.model_dump()) return UpdateTableTagResponse.from_dict(response_dict) + def create_table_branch( + self, request: CreateTableBranchRequest + ) -> CreateTableBranchResponse: + """Create a new branch forked from a table version.""" + response_dict = self._inner.create_table_branch(request.model_dump()) + return CreateTableBranchResponse.from_dict(response_dict) + + def list_table_branches( + self, request: ListTableBranchesRequest + ) -> ListTableBranchesResponse: + """List all branches of a table.""" + response_dict = self._inner.list_table_branches(request.model_dump()) + return ListTableBranchesResponse.from_dict(response_dict) + + def delete_table_branch( + self, request: DeleteTableBranchRequest + ) -> DeleteTableBranchResponse: + """Delete a branch from a table.""" + response_dict = self._inner.delete_table_branch(request.model_dump()) + return DeleteTableBranchResponse.from_dict(response_dict) + # Operation metrics methods def retrieve_ops_metrics(self) -> Dict[str, int]: diff --git a/python/python/tests/test_dataset.py b/python/python/tests/test_dataset.py index 4af363868e1..89bd78b82c8 100644 --- a/python/python/tests/test_dataset.py +++ b/python/python/tests/test_dataset.py @@ -1742,6 +1742,7 @@ def test_commit_batch_append(): result = lance.LanceDataset.commit_batch(dataset, [txn2, txn3]) dataset = result["dataset"] assert dataset.version == 2 + assert dataset.checkout_version(1).version == 1 assert len(dataset.get_fragments()) == 3 assert dataset.to_table() == pa.concat_tables([data1, data2, data3]) merged_txn = result["merged"] @@ -5538,6 +5539,8 @@ def test_branches(tmp_path: Path): branch1 = ds_main.create_branch("branch1") ds_main.branches.replace_metadata("branch1", {"description": "branch one"}) assert branch1.version == 1 + # The dataset returned by create_branch must be fully constructed + assert branch1.checkout_version(("main", None)).version == 1 branch1_append = pa.Table.from_pydict({"a": [7, 
8], "b": [9, 10]}) branch1 = lance.write_dataset(branch1_append, branch1, mode="append") assert branch1.version == 2 diff --git a/python/python/tests/test_namespace_dir.py b/python/python/tests/test_namespace_dir.py index 1991b82946e..a57879d3368 100644 --- a/python/python/tests/test_namespace_dir.py +++ b/python/python/tests/test_namespace_dir.py @@ -29,6 +29,8 @@ CountTableRowsRequest, CreateNamespaceRequest, CreateNamespaceResponse, + CreateTableBranchRequest, + CreateTableBranchResponse, CreateTableIndexRequest, CreateTableIndexResponse, CreateTableRequest, @@ -37,6 +39,8 @@ CreateTableVersionResponse, DeclareTableRequest, DeclareTableResponse, + DeleteTableBranchRequest, + DeleteTableBranchResponse, DeregisterTableRequest, DeregisterTableResponse, DescribeNamespaceRequest, @@ -54,6 +58,8 @@ InsertIntoTableResponse, ListNamespacesRequest, ListNamespacesResponse, + ListTableBranchesRequest, + ListTableBranchesResponse, ListTableIndicesRequest, ListTableIndicesResponse, ListTablesRequest, @@ -71,6 +77,8 @@ InvalidInputError, NamespaceNotEmptyError, NamespaceNotFoundError, + TableBranchAlreadyExistsError, + TableBranchNotFoundError, TableNotFoundError, ) @@ -151,6 +159,21 @@ def create_table_version( ) -> CreateTableVersionResponse: return self._inner.create_table_version(request) + def create_table_branch( + self, request: CreateTableBranchRequest + ) -> CreateTableBranchResponse: + return self._inner.create_table_branch(request) + + def list_table_branches( + self, request: ListTableBranchesRequest + ) -> ListTableBranchesResponse: + return self._inner.list_table_branches(request) + + def delete_table_branch( + self, request: DeleteTableBranchRequest + ) -> DeleteTableBranchResponse: + return self._inner.delete_table_branch(request) + def create_table_index( self, request: CreateTableIndexRequest ) -> CreateTableIndexResponse: @@ -564,6 +587,110 @@ def test_register_table_rejects_path_traversal(self, temp_ns_client): assert "Path traversal is not allowed" in 
str(exc_info.value) +class TestTableBranchOperations: + """Branch CRUD through the python bindings - mirrors the Rust branch + CRUD tests.""" + + def test_branch_crud_round_trip(self, temp_ns_client): + create_ns_req = CreateNamespaceRequest(id=["workspace"]) + temp_ns_client.create_namespace(create_ns_req) + ipc_data = table_to_ipc_bytes(create_test_data()) + table_id = ["workspace", "branched_table"] + temp_ns_client.create_table(CreateTableRequest(id=table_id), ipc_data) + + temp_ns_client.create_table_branch( + CreateTableBranchRequest(id=table_id, name="dev") + ) + listed = temp_ns_client.list_table_branches( + ListTableBranchesRequest(id=table_id) + ) + assert "dev" in listed.branches + assert listed.branches["dev"].parent_version == 1 + + # Duplicate creation and deleting a missing branch surface the typed + # branch errors (codes 23 and 22), not InternalError. + temp_ns_client.create_table_branch( + CreateTableBranchRequest(id=table_id, name="dev2") + ) + with pytest.raises(TableBranchAlreadyExistsError): + temp_ns_client.create_table_branch( + CreateTableBranchRequest(id=table_id, name="dev2") + ) + + temp_ns_client.delete_table_branch( + DeleteTableBranchRequest(id=table_id, name="dev") + ) + listed = temp_ns_client.list_table_branches( + ListTableBranchesRequest(id=table_id) + ) + assert "dev" not in listed.branches + with pytest.raises(TableBranchNotFoundError): + temp_ns_client.delete_table_branch( + DeleteTableBranchRequest(id=table_id, name="dev") + ) + + def test_create_branch_from_other_branch(self, temp_ns_client): + """Forking from a non-main source branch records the right parent.""" + create_ns_req = CreateNamespaceRequest(id=["workspace"]) + temp_ns_client.create_namespace(create_ns_req) + ipc_data = table_to_ipc_bytes(create_test_data()) + table_id = ["workspace", "fork_table"] + temp_ns_client.create_table(CreateTableRequest(id=table_id), ipc_data) + + temp_ns_client.create_table_branch( + CreateTableBranchRequest(id=table_id, name="dev") + 
) + temp_ns_client.create_table_branch( + CreateTableBranchRequest(id=table_id, name="child", from_branch="dev") + ) + listed = temp_ns_client.list_table_branches( + ListTableBranchesRequest(id=table_id) + ) + assert listed.branches["child"].parent_branch == "dev" + + +class _ForeignCodeError(Exception): + """Not a LanceNamespaceError, but carries the same integer code as + TABLE_NOT_FOUND.""" + + code = 4 + + +class _RaisingNamespace(LanceNamespace): + """A namespace whose describe_table raises the configured exception.""" + + def __init__(self, exc: Exception): + self._exc = exc + + def namespace_id(self) -> str: + return "raising" + + def describe_table(self, request: DescribeTableRequest) -> DescribeTableResponse: + raise self._exc + + +class TestPythonNamespaceErrorMapping: + """The Rust adapter must trust the `code` attribute only on the + lance_namespace exception hierarchy.""" + + def test_namespace_error_identity_preserved(self): + ns = _RaisingNamespace(TableNotFoundError("no such table")) + with pytest.raises(TableNotFoundError, match="no such table"): + lance.dataset(namespace_client=ns, table_id=["t"]) + + # Branch error codes (22/23) survive the round trip too. + ns = _RaisingNamespace(TableBranchNotFoundError("no such branch")) + with pytest.raises(TableBranchNotFoundError, match="no such branch"): + lance.dataset(namespace_client=ns, table_id=["t"]) + + def test_foreign_code_attribute_not_trusted(self): + # The foreign exception must surface as itself, not be reinterpreted + # as a namespace error via its `code` attribute. 
+ ns = _RaisingNamespace(_ForeignCodeError("boom")) + with pytest.raises(_ForeignCodeError, match="boom"): + lance.dataset(namespace_client=ns, table_id=["t"]) + + class TestChildNamespaceOperations: """Tests for operations in child namespaces - mirrors Rust tests.""" diff --git a/python/python/tests/test_namespace_integration.py b/python/python/tests/test_namespace_integration.py index 4605b755816..fc08370d247 100644 --- a/python/python/tests/test_namespace_integration.py +++ b/python/python/tests/test_namespace_integration.py @@ -31,6 +31,8 @@ from lance_namespace import ( CreateNamespaceRequest, CreateNamespaceResponse, + CreateTableBranchRequest, + CreateTableBranchResponse, CreateTableRequest, CreateTableResponse, CreateTableVersionRequest, @@ -136,6 +138,11 @@ def create_table_version( ) -> CreateTableVersionResponse: return self._inner.create_table_version(request) + def create_table_branch( + self, request: CreateTableBranchRequest + ) -> CreateTableBranchResponse: + return self._inner.create_table_branch(request) + def retrieve_ops_metrics(self) -> Optional[Dict[str, int]]: return self._inner.retrieve_ops_metrics() @@ -199,6 +206,7 @@ def create_tracking_namespace( storage_options: dict, credential_expires_in_seconds: int = 60, use_custom: bool = False, + managed_versioning: bool = False, ): """Create a DirectoryNamespace with ops metrics and credential vending enabled. @@ -212,6 +220,9 @@ def create_tracking_namespace( storage_options: Storage options to pass through (credentials, endpoint, etc.) 
credential_expires_in_seconds: Interval in seconds for credential expiration use_custom: If True, wrap in CustomNamespace for testing custom implementations + managed_versioning: If True, enable the manifest catalog so table versions + are tracked by the namespace and commits route through + create_table_version Returns: Tuple of (namespace_client, inner_namespace_client) where inner is always @@ -238,6 +249,10 @@ def create_tracking_namespace( dir_props["vend_input_storage_options_refresh_interval_millis"] = str( credential_expires_in_seconds * 1000 ) + if managed_versioning: + dir_props["manifest_enabled"] = "true" + dir_props["table_version_tracking_enabled"] = "true" + dir_props["table_version_storage_enabled"] = "true" inner_ns_client = DirectoryNamespace(**dir_props) ns_client = _wrap_if_custom(inner_ns_client, use_custom) @@ -558,6 +573,87 @@ def test_namespace_write_overwrite_mode(s3_bucket: str, use_custom: bool): assert get_describe_call_count(inner_ns_client) == call_count_before_reads +@pytest.mark.integration +@pytest.mark.parametrize("use_custom", [False, True], ids=["DirectoryNS", "CustomNS"]) +def test_namespace_managed_branches(s3_bucket: str, use_custom: bool): + """Branches on a managed-versioning table over S3. + + Branch commits must route through the catalog (create_table_version) and + leave main's chain untouched. A cross-branch checkout at an overlapping + version number must resolve the requested chain: branch version numbers + continue from the fork point, so the same number exists on both chains + with different data. 
+ """ + storage_options = copy.deepcopy(CONFIG) + + ns_client, inner_ns_client = create_tracking_namespace( + bucket_name=s3_bucket, + storage_options=storage_options, + credential_expires_in_seconds=3600, + use_custom=use_custom, + managed_versioning=True, + ) + + table_name = uuid.uuid4().hex + table_id = ["test_ns", table_name] + + def commit_count() -> int: + return inner_ns_client.retrieve_ops_metrics().get("create_table_version", 0) + + lance.write_dataset( + pa.Table.from_pylist([{"a": 1}]), + namespace_client=ns_client, + table_id=table_id, + mode="create", + storage_options=storage_options, + ) + ds = lance.write_dataset( + pa.Table.from_pylist([{"a": 2}]), + namespace_client=ns_client, + table_id=table_id, + mode="append", + storage_options=storage_options, + ) + assert commit_count() >= 2 + + ns_client.create_table_branch( + CreateTableBranchRequest(id=table_id, name="dev", from_version=2) + ) + + dev = ds.checkout_version(("dev", None)) + commits_before_branch_append = commit_count() + dev = lance.write_dataset( + pa.Table.from_pylist([{"a": 3}]), + dev, + mode="append", + storage_options=storage_options, + ) + assert commit_count() == commits_before_branch_append + 1 + assert sorted(dev.to_table()["a"].to_pylist()) == [1, 2, 3] + + # Diverge main to the same version number as dev's tip. 
+ ds = lance.write_dataset( + pa.Table.from_pylist([{"a": 100}]), + namespace_client=ns_client, + table_id=table_id, + mode="append", + storage_options=storage_options, + ) + assert sorted(ds.to_table()["a"].to_pylist()) == [1, 2, 100] + + on_dev = ds.checkout_version(("dev", 3)) + assert sorted(on_dev.to_table()["a"].to_pylist()) == [1, 2, 3] + back_on_main = dev.checkout_version(("main", None)) + assert sorted(back_on_main.to_table()["a"].to_pylist()) == [1, 2, 100] + + fresh = lance.dataset( + namespace_client=ns_client, + table_id=table_id, + storage_options=storage_options, + ) + assert sorted(fresh.to_table()["a"].to_pylist()) == [1, 2, 100] + + @pytest.mark.integration @pytest.mark.parametrize("use_custom", [False, True], ids=["DirectoryNS", "CustomNS"]) def test_namespace_distributed_write(s3_bucket: str, use_custom: bool): diff --git a/python/python/tests/test_s3_ddb.py b/python/python/tests/test_s3_ddb.py index b9c9e4be6c0..dc9744115e2 100644 --- a/python/python/tests/test_s3_ddb.py +++ b/python/python/tests/test_s3_ddb.py @@ -212,6 +212,58 @@ def writh_dataset_with_start_barrier(): assert lance.dataset(table_dir).count_rows() == expected_version * 2 +@pytest.mark.integration +def test_s3_ddb_branches(s3_bucket: str, ddb_table: str): + """Branches on a table committed through the DynamoDB external manifest + store. + + The DDB store keys version chains by base uri, so each branch chain must + get its own entries via its branch-qualified path. Both chains are given + the same version number with diverged data so a wrong-chain resolution + cannot pass silently. 
+ """ + storage_options = copy.deepcopy(CONFIG) + table_name = uuid.uuid4().hex + table_dir = f"s3+ddb://{s3_bucket}/{table_name}?ddbTableName={ddb_table}" + + # main: v1 (a=1), v2 (a=2) + lance.write_dataset( + pa.Table.from_pylist([{"a": 1}]), table_dir, storage_options=storage_options + ) + ds = lance.write_dataset( + pa.Table.from_pylist([{"a": 2}]), + table_dir, + mode="append", + storage_options=storage_options, + ) + + # Fork "dev" at v2 and commit on it, then diverge main to the same + # version number. + dev = ds.create_branch("dev", 2) + dev = lance.write_dataset( + pa.Table.from_pylist([{"a": 3}]), + dev, + mode="append", + storage_options=storage_options, + ) + ds = lance.write_dataset( + pa.Table.from_pylist([{"a": 100}]), + table_dir, + mode="append", + storage_options=storage_options, + ) + + assert sorted(dev.to_table()["a"].to_pylist()) == [1, 2, 3] + assert sorted(ds.to_table()["a"].to_pylist()) == [1, 2, 100] + + # Cross-branch checkout at the overlapping version number resolves each + # chain's own data. + on_dev = ds.checkout_version(("dev", 3)) + assert sorted(on_dev.to_table()["a"].to_pylist()) == [1, 2, 3] + back_on_main = dev.checkout_version(("main", None)) + assert sorted(back_on_main.to_table()["a"].to_pylist()) == [1, 2, 100] + + @pytest.mark.integration def test_s3_unsafe(s3_bucket: str): storage_options = copy.deepcopy(CONFIG) diff --git a/python/src/namespace.rs b/python/src/namespace.rs index cf5f7c41b0f..e88ff40de2c 100644 --- a/python/src/namespace.rs +++ b/python/src/namespace.rs @@ -392,6 +392,44 @@ impl PyDirectoryNamespace { pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string())) } + // Table branch operations + + fn create_table_branch<'py>( + &self, + py: Python<'py>, + request: &Bound<'_, PyAny>, + ) -> PyResult> { + let request = depythonize(request)?; + let response = crate::rt() + .block_on(Some(py), self.inner.create_table_branch(request))? 
+ .infer_error()?; + pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string())) + } + + fn list_table_branches<'py>( + &self, + py: Python<'py>, + request: &Bound<'_, PyAny>, + ) -> PyResult> { + let request = depythonize(request)?; + let response = crate::rt() + .block_on(Some(py), self.inner.list_table_branches(request))? + .infer_error()?; + pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string())) + } + + fn delete_table_branch<'py>( + &self, + py: Python<'py>, + request: &Bound<'_, PyAny>, + ) -> PyResult> { + let request = depythonize(request)?; + let response = crate::rt() + .block_on(Some(py), self.inner.delete_table_branch(request))? + .infer_error()?; + pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string())) + } + // Data manipulation operations fn count_table_rows(&self, py: Python, request: &Bound<'_, PyAny>) -> PyResult { @@ -1054,6 +1092,44 @@ impl PyRestNamespace { pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string())) } + // Table branch operations + + fn create_table_branch<'py>( + &self, + py: Python<'py>, + request: &Bound<'_, PyAny>, + ) -> PyResult> { + let request = depythonize(request)?; + let response = crate::rt() + .block_on(Some(py), self.inner.create_table_branch(request))? + .infer_error()?; + pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string())) + } + + fn list_table_branches<'py>( + &self, + py: Python<'py>, + request: &Bound<'_, PyAny>, + ) -> PyResult> { + let request = depythonize(request)?; + let response = crate::rt() + .block_on(Some(py), self.inner.list_table_branches(request))? 
+ .infer_error()?; + pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string())) + } + + fn delete_table_branch<'py>( + &self, + py: Python<'py>, + request: &Bound<'_, PyAny>, + ) -> PyResult> { + let request = depythonize(request)?; + let response = crate::rt() + .block_on(Some(py), self.inner.delete_table_branch(request))? + .infer_error()?; + pythonize(py, &response).map_err(|e| pyo3::exceptions::PyValueError::new_err(e.to_string())) + } + // Data manipulation operations fn count_table_rows(&self, py: Python, request: &Bound<'_, PyAny>) -> PyResult { @@ -1472,6 +1548,30 @@ fn get_dict_with_model_dump_class(py: Python<'_>) -> PyResult> Ok(class) } +/// Convert a Python namespace exception into a lance error, preserving the +/// namespace error identity when the exception is a `lance_namespace` +/// `LanceNamespaceError` carrying an error `code`, so callers can react to +/// e.g. TableNotFound the same way they do for native clients. Foreign +/// exceptions that happen to carry an integer `code` (e.g. SystemExit) must +/// not be reinterpreted, so the extraction is gated on the exception type. +fn namespace_error_from_py(method_name: &'static str, e: PyErr) -> lance_core::Error { + Python::attach(|py| { + let value = e.value(py); + let is_namespace_error = py + .import("lance_namespace.errors") + .and_then(|module| module.getattr("LanceNamespaceError")) + .and_then(|class| value.is_instance(&class)) + .unwrap_or(false); + if is_namespace_error + && let Ok(code) = value.getattr("code").and_then(|code| code.extract::()) + { + return lance_namespace::error::NamespaceError::from_code(code, value.to_string()) + .into(); + } + lance_core::Error::io(format!("Python error in {}: {}", method_name, e)) + }) +} + /// Helper to call a Python namespace method with JSON serialization. /// For methods that take a request and return a response. 
/// Uses DictWithModelDump to pass a dict that also has model_dump() method, @@ -1519,7 +1619,7 @@ where }) .await .map_err(|e| lance_core::Error::io(format!("Task join error for {}: {}", method_name, e)))? - .map_err(|e: PyErr| lance_core::Error::io(format!("Python error in {}: {}", method_name, e)))?; + .map_err(|e: PyErr| namespace_error_from_py(method_name, e))?; serde_json::from_str(&response_json).map_err(|e| { lance_core::Error::io(format!( diff --git a/python/uv.lock b/python/uv.lock index 428578ab26f..289ecdf3549 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -1083,19 +1083,19 @@ wheels = [ [[package]] name = "lance-namespace" -version = "0.8.4" +version = "0.8.5" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "lance-namespace-urllib3-client" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/48/8f/8a03395587a78cfaf92f7307ad931f61eb515af67705c704bd6c7af2f745/lance_namespace-0.8.4.tar.gz", hash = "sha256:1a54ad49e7ace25a629c5f2c99d393629742eceeeb16ba2f51a771ccb350e284", size = 11282, upload-time = "2026-06-10T19:07:21.919Z" } +sdist = { url = "https://files.pythonhosted.org/packages/d0/22/3d8eb4e913edf36cda416f1dca287147af508abe3ca89bf0e619b9fa9f54/lance_namespace-0.8.5.tar.gz", hash = "sha256:b4a5967afcbf9924300a0b9d2fb74c44a23f76907e8734ebed6e0e3a561b0df0", size = 11531, upload-time = "2026-06-11T16:20:26.77Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/fd/4b/218c67cafb707024069925ce86534588861a464aaa327f7a457b94eed3c2/lance_namespace-0.8.4-py3-none-any.whl", hash = "sha256:8b347eef4b7c7187a1b52f388b5dcc345fed0bf4ea87728188dcb11a52619d0b", size = 13111, upload-time = "2026-06-10T19:07:22.6Z" }, + { url = "https://files.pythonhosted.org/packages/c0/da/afc3cdc42fc2dcf885a9d3524bf2c3bd2a9df89b1668b1806dec5e436263/lance_namespace-0.8.5-py3-none-any.whl", hash = "sha256:6d3e2b8da586d06409494b56955a63c3152eeae2883cd2e8ba4e80d20dc0de0f", size = 13383, upload-time = "2026-06-11T16:20:26.004Z" }, ] 
[[package]] name = "lance-namespace-urllib3-client" -version = "0.8.4" +version = "0.8.5" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "pydantic" }, @@ -1104,9 +1104,9 @@ dependencies = [ { name = "urllib3", version = "1.26.20", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "urllib3", version = "2.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/0a/55/4a7cc7e5d19bda170c896a6adff2ec925c533df812b91bce2bc8f7aea30b/lance_namespace_urllib3_client-0.8.4.tar.gz", hash = "sha256:1a292a83509ab79475da967b78839e9ead4ab973064d37d1ba1575b23ffdacef", size = 228485, upload-time = "2026-06-10T19:07:19.863Z" } +sdist = { url = "https://files.pythonhosted.org/packages/44/6f/1291523488523656342d1b424b76b4d91f3af6413b3b4ada43b888a87043/lance_namespace_urllib3_client-0.8.5.tar.gz", hash = "sha256:29922ffb5b0621e24a83183454ec3e5a5828f46d91a95d58efc35db05dec4e62", size = 228595, upload-time = "2026-06-11T16:20:23.985Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b4/f7/70dd2fc1f9ef462d3802b4cffcd64f2b9233a9907d6071e8694338492608/lance_namespace_urllib3_client-0.8.4-py3-none-any.whl", hash = "sha256:37ee1d74614fae6358f50e3589ac26c29379ffb1346f09c4f5ec8953f823cefd", size = 369807, upload-time = "2026-06-10T19:07:21.001Z" }, + { url = "https://files.pythonhosted.org/packages/10/e2/62883d1f43a283ac08f00af993c6a2b92e4ca206fa1ccba032420d8dc578/lance_namespace_urllib3_client-0.8.5-py3-none-any.whl", hash = "sha256:8af211ddc6e73df713ffb59368c94780508e732b19dacb4239d937aaff2f8e3c", size = 369857, upload-time = "2026-06-11T16:20:25.006Z" }, ] [[package]] @@ -2676,7 +2676,7 @@ requires-dist = [ { name = "duckdb", marker = "extra == 'tests'" }, { name = "geoarrow-rust-core", marker = "extra == 'geo'" }, { name = "geoarrow-rust-io", marker = "extra == 'geo'" }, - { name = 
"lance-namespace", specifier = ">=0.8.0,<0.9" }, + { name = "lance-namespace", specifier = ">=0.8.5,<0.9" }, { name = "ml-dtypes", marker = "extra == 'tests'" }, { name = "numpy", specifier = ">=1.22" }, { name = "pandas", marker = "extra == 'tests'" }, diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index c9cc356aaa6..f6fd1ef6a20 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -514,7 +514,10 @@ impl Dataset { let transaction = Transaction::new(version_number, clone_op, None); let builder = CommitBuilder::new(WriteDestination::Uri(branch_location.uri.as_str())) - .with_store_params(store_params.unwrap_or_default()) + // Fall back to the dataset's own store params + .with_store_params( + store_params.unwrap_or(self.store_params.as_deref().cloned().unwrap_or_default()), + ) .with_object_store(Arc::new(self.object_store.as_ref().clone())) .with_commit_handler(self.commit_handler.clone()) .with_storage_format(self.manifest.data_storage_format.lance_file_version()?); diff --git a/rust/lance/src/dataset/branch_location.rs b/rust/lance/src/dataset/branch_location.rs index 3a1185c8cf8..7ebce36ec86 100644 --- a/rust/lance/src/dataset/branch_location.rs +++ b/rust/lance/src/dataset/branch_location.rs @@ -31,14 +31,20 @@ impl BranchLocation { } fn get_root_path(path_str: &str, branch_name: &str) -> Result { + // A uri may carry a query string (e.g. `s3+ddb://...?ddbTableName=t`); + // the branch suffix sits on the path part, before the query. 
+ let (path_part, query) = match path_str.split_once('?') { + Some((path, query)) => (path, Some(query)), + None => (path_str, None), + }; let branch_suffix = format!("{}/{}", BRANCH_DIR, branch_name); let branch_suffix = branch_suffix.as_str(); - let root_path_str = path_str + let root_path_str = path_part .strip_suffix(branch_suffix) .or_else(|| { if cfg!(windows) { let windows_suffix = branch_suffix.replace('/', "\\"); - path_str.strip_suffix(&windows_suffix) + path_part.strip_suffix(&windows_suffix) } else { None } @@ -59,7 +65,10 @@ impl BranchLocation { root_path_str, path_str, ))); }; - Ok(root_path_str) + Ok(match query { + Some(query) => format!("{}?{}", root_path_str, query), + None => root_path_str, + }) } /// The branch a location under `root` targets: the inverse of @@ -132,13 +141,23 @@ impl BranchLocation { } fn join_str(base: &str, segment: &str) -> Result { + // A uri may carry a query string (e.g. `s3+ddb://...?ddbTableName=t`); + // path segments must be appended before it. 
+ let (path_part, query) = match base.split_once('?') { + Some((path, query)) => (path, Some(query)), + None => (base, None), + }; let normalized_segment = segment.trim_start_matches('/'); - let is_base_dir = base.ends_with("/"); - if is_base_dir { - Ok(format!("{}{}", base, normalized_segment)) + let is_base_dir = path_part.ends_with("/"); + let joined = if is_base_dir { + format!("{}{}", path_part, normalized_segment) } else { - Ok(format!("{}/{}", base, normalized_segment)) - } + format!("{}/{}", path_part, normalized_segment) + }; + Ok(match query { + Some(query) => format!("{}?{}", joined, query), + None => joined, + }) } } @@ -255,6 +274,30 @@ mod tests { assert!(fs::create_dir_all(std::path::Path::new(new_location.uri.as_str())).is_ok()); } + #[test] + fn test_branch_location_with_query_uri() { + // Uris like `s3+ddb://...?ddbTableName=t` carry the commit handler + // config in the query string; branch path segments must be inserted + // before it and the query must survive the round trip. 
+ let location = BranchLocation { + path: Path::parse("bucket/table.lance").unwrap(), + uri: "s3+ddb://bucket/table.lance?ddbTableName=t".to_string(), + branch: None, + }; + let dev = location.find_branch(Some("dev")).unwrap(); + assert_eq!( + dev.uri, + "s3+ddb://bucket/table.lance/tree/dev?ddbTableName=t" + ); + assert_eq!(dev.path.as_ref(), "bucket/table.lance/tree/dev"); + assert_eq!(dev.branch.as_deref(), Some("dev")); + + let main = dev.find_main().unwrap(); + assert_eq!(main.uri, "s3+ddb://bucket/table.lance?ddbTableName=t"); + assert_eq!(main.path.as_ref(), "bucket/table.lance"); + assert_eq!(main.branch, None); + } + #[test] fn test_branch_of() { let derive = |root: &str, location: &str| BranchLocation::branch_of(root, location); diff --git a/rust/lance/src/io/commit/namespace_manifest.rs b/rust/lance/src/io/commit/namespace_manifest.rs index 92d5e7bc789..f4f012adcca 100644 --- a/rust/lance/src/io/commit/namespace_manifest.rs +++ b/rust/lance/src/io/commit/namespace_manifest.rs @@ -14,8 +14,24 @@ use lance_table::io::commit::{ManifestLocation, ManifestNamingScheme}; use object_store::ObjectStore as OSObjectStore; use object_store::path::Path; +use lance_namespace::error::NamespaceError; + use crate::dataset::branch_location::BranchLocation; +/// Whether `e` says the requested chain (table or branch) does not exist, as +/// opposed to a failure talking to the namespace. +fn is_chain_not_found(e: &lance_core::Error) -> bool { + if let lance_core::Error::Namespace { source, .. } = e + && let Some(ns_err) = source.downcast_ref::() + { + return matches!( + ns_err, + NamespaceError::TableNotFound { .. } | NamespaceError::TableBranchNotFound { .. 
} + ); + } + false +} + #[derive(Debug)] pub struct LanceNamespaceExternalManifestStore { namespace_client: Arc, @@ -90,7 +106,15 @@ impl ExternalManifestStore for LanceNamespaceExternalManifestStore { ..Default::default() }; - let response = self.namespace_client.list_table_versions(request).await?; + let response = match self.namespace_client.list_table_versions(request).await { + Ok(response) => response, + // A chain that does not exist yet (e.g. probing a branch location + // before the branch is created) has no latest version; the + // ExternalManifestStore contract reports that as None, not an + // error, so existence checks can treat it as a missing dataset. + Err(e) if is_chain_not_found(&e) => return Ok(None), + Err(e) => return Err(e), + }; if response.versions.is_empty() { return Ok(None); @@ -182,3 +206,93 @@ impl ExternalManifestStore for LanceNamespaceExternalManifestStore { )) } } + +#[cfg(test)] +mod tests { + use super::*; + use lance_namespace::models::ListTableVersionsResponse; + + /// A namespace whose list_table_versions always fails with the configured + /// error, to pin how get_latest_version classifies failures. + #[derive(Debug)] + struct FailingNamespace { + error: fn() -> lance_core::Error, + } + + #[async_trait] + impl LanceNamespace for FailingNamespace { + fn namespace_id(&self) -> String { + "failing".to_string() + } + + async fn list_table_versions( + &self, + _request: ListTableVersionsRequest, + ) -> Result { + Err((self.error)()) + } + } + + fn store_with(error: fn() -> lance_core::Error) -> LanceNamespaceExternalManifestStore { + LanceNamespaceExternalManifestStore::new( + Arc::new(FailingNamespace { error }), + vec!["t".to_string()], + Path::parse("data/t.lance").unwrap(), + ) + } + + /// A chain that does not exist (missing table or branch) has no latest + /// version; everything else is a real failure and must propagate so an + /// outage is never mistaken for an absent dataset. 
+ #[tokio::test] + async fn test_get_latest_version_error_classification() { + use lance_namespace::error::NamespaceError; + + let absent = [ + store_with(|| { + NamespaceError::TableNotFound { + message: "missing table".to_string(), + } + .into() + }), + store_with(|| { + NamespaceError::TableBranchNotFound { + message: "missing branch".to_string(), + } + .into() + }), + ]; + for store in absent { + let latest = store.get_latest_version("data/t.lance/tree/dev").await; + assert!( + matches!(latest, Ok(None)), + "a missing chain must read as no latest version, got: {:?}", + latest + ); + } + + let failures = [ + store_with(|| { + NamespaceError::Internal { + message: "server error".to_string(), + } + .into() + }), + store_with(|| { + NamespaceError::Throttling { + message: "slow down".to_string(), + } + .into() + }), + store_with(|| lance_core::Error::io("connection reset".to_string())), + ]; + for store in failures { + let latest = store.get_latest_version("data/t.lance/tree/dev").await; + assert!( + latest.is_err(), + "a real failure must propagate, got: {:?}", + latest + ); + } + } +} From 434682dbd5df22ec45c3b0c0ed1f73fb89adda8c Mon Sep 17 00:00:00 2001 From: Lance Release Bot Date: Thu, 11 Jun 2026 18:00:46 +0000 Subject: [PATCH 089/177] chore: release beta version 8.0.0-beta.12 --- .bumpversion.toml | 2 +- Cargo.lock | 58 +++++++++++++++++++-------------------- Cargo.toml | 44 ++++++++++++++--------------- java/lance-jni/Cargo.lock | 54 +++++++++++++++++++----------------- java/lance-jni/Cargo.toml | 2 +- java/pom.xml | 2 +- python/Cargo.lock | 50 ++++++++++++++++----------------- python/Cargo.toml | 2 +- 8 files changed, 109 insertions(+), 105 deletions(-) diff --git a/.bumpversion.toml b/.bumpversion.toml index 7d766a80aff..b3ca85f628e 100644 --- a/.bumpversion.toml +++ b/.bumpversion.toml @@ -1,5 +1,5 @@ [tool.bumpversion] -current_version = "8.0.0-beta.11" +current_version = "8.0.0-beta.12" parse = 
"(?P\\d+)\\.(?P\\d+)\\.(?P\\d+)(-(?P(beta|rc))\\.(?P\\d+))?" serialize = [ "{major}.{minor}.{patch}-{prerelease}.{prerelease_num}", diff --git a/Cargo.lock b/Cargo.lock index 75b7f902d7a..64449a632ec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1178,9 +1178,9 @@ dependencies = [ [[package]] name = "block-buffer" -version = "0.12.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdd35008169921d80bc60d3d0ab416eecb028c4cd653352907921d95084790be" +checksum = "d2f6c7dbe95a6ed67ad9f18e57daf93a2f034c524b99fd2b76d18fdfeb6660aa" dependencies = [ "hybrid-array", ] @@ -2809,7 +2809,7 @@ version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1dd6dbb5841937940781866fa1281a1ff7bd3bf827091440879f9994983d5c2" dependencies = [ - "block-buffer 0.12.0", + "block-buffer 0.12.1", "const-oid 0.10.2", "crypto-common 0.2.2", "ctutils", @@ -3146,7 +3146,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4458,7 +4458,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "all_asserts", "approx", @@ -4561,7 +4561,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow-array", "arrow-buffer", @@ -4609,7 +4609,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrayref", "paste", @@ -4618,7 +4618,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow-array", "arrow-buffer", @@ -4658,7 +4658,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.11" +version = 
"8.0.0-beta.12" dependencies = [ "arrow", "arrow-array", @@ -4691,7 +4691,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow", "arrow-array", @@ -4711,7 +4711,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "proc-macro2", "quote", @@ -4720,7 +4720,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow-arith", "arrow-array", @@ -4765,7 +4765,7 @@ dependencies = [ [[package]] name = "lance-examples" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "all_asserts", "arrow", @@ -4791,7 +4791,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow-arith", "arrow-array", @@ -4830,7 +4830,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "datafusion", "geo-traits", @@ -4844,7 +4844,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "approx", "arc-swap", @@ -4920,7 +4920,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow", "arrow-arith", @@ -4968,7 +4968,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "approx", "arrow-array", @@ -4987,7 +4987,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow", "async-trait", @@ -4999,7 +4999,7 @@ dependencies = [ [[package]] name = "lance-namespace-datafusion" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow-array", "arrow-schema", @@ -5015,7 +5015,7 @@ dependencies = [ [[package]] name = 
"lance-namespace-impls" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow", "arrow-array", @@ -5064,9 +5064,9 @@ dependencies = [ [[package]] name = "lance-namespace-reqwest-client" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04b4e5caefa132a9cce54b2d4dc95016b949b3a290a83ad5057e705df43d75be" +checksum = "0d287494559c22838ce34e51ea0fa29dc780d5be8283de5ab33e9395623000c8" dependencies = [ "reqwest 0.12.28", "serde", @@ -5078,7 +5078,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow-array", "arrow-buffer", @@ -5096,7 +5096,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow", "arrow-array", @@ -5142,7 +5142,7 @@ dependencies = [ [[package]] name = "lance-test-macros" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "proc-macro2", "quote", @@ -5151,7 +5151,7 @@ dependencies = [ [[package]] name = "lance-testing" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow-array", "arrow-schema", @@ -5164,7 +5164,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "icu_segmenter", "jieba-rs", @@ -5176,7 +5176,7 @@ dependencies = [ [[package]] name = "lance-tools" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "clap", "lance-core", diff --git a/Cargo.toml b/Cargo.toml index 1996e2a2d57..595d1fe41d6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ resolver = "3" [workspace.package] -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" @@ -57,27 +57,27 @@ rust-version = "1.91.0" [workspace.dependencies] arc-swap = "1.7" libc = "0.2.176" -lance = { version = "=8.0.0-beta.11", path = "./rust/lance", 
default-features = false } -lance-arrow = { version = "=8.0.0-beta.11", path = "./rust/lance-arrow" } -lance-core = { version = "=8.0.0-beta.11", path = "./rust/lance-core" } -lance-datafusion = { version = "=8.0.0-beta.11", path = "./rust/lance-datafusion" } -lance-datagen = { version = "=8.0.0-beta.11", path = "./rust/lance-datagen" } -lance-derive = { version = "=8.0.0-beta.11", path = "./rust/lance-derive" } -lance-encoding = { version = "=8.0.0-beta.11", path = "./rust/lance-encoding" } -lance-file = { version = "=8.0.0-beta.11", path = "./rust/lance-file" } -lance-geo = { version = "=8.0.0-beta.11", path = "./rust/lance-geo" } -lance-index = { version = "=8.0.0-beta.11", path = "./rust/lance-index" } -lance-io = { version = "=8.0.0-beta.11", path = "./rust/lance-io", default-features = false } -lance-linalg = { version = "=8.0.0-beta.11", path = "./rust/lance-linalg" } -lance-namespace = { version = "=8.0.0-beta.11", path = "./rust/lance-namespace" } -lance-namespace-impls = { version = "=8.0.0-beta.11", path = "./rust/lance-namespace-impls" } +lance = { version = "=8.0.0-beta.12", path = "./rust/lance", default-features = false } +lance-arrow = { version = "=8.0.0-beta.12", path = "./rust/lance-arrow" } +lance-core = { version = "=8.0.0-beta.12", path = "./rust/lance-core" } +lance-datafusion = { version = "=8.0.0-beta.12", path = "./rust/lance-datafusion" } +lance-datagen = { version = "=8.0.0-beta.12", path = "./rust/lance-datagen" } +lance-derive = { version = "=8.0.0-beta.12", path = "./rust/lance-derive" } +lance-encoding = { version = "=8.0.0-beta.12", path = "./rust/lance-encoding" } +lance-file = { version = "=8.0.0-beta.12", path = "./rust/lance-file" } +lance-geo = { version = "=8.0.0-beta.12", path = "./rust/lance-geo" } +lance-index = { version = "=8.0.0-beta.12", path = "./rust/lance-index" } +lance-io = { version = "=8.0.0-beta.12", path = "./rust/lance-io", default-features = false } +lance-linalg = { version = "=8.0.0-beta.12", path = 
"./rust/lance-linalg" } +lance-namespace = { version = "=8.0.0-beta.12", path = "./rust/lance-namespace" } +lance-namespace-impls = { version = "=8.0.0-beta.12", path = "./rust/lance-namespace-impls" } lance-namespace-datafusion = { version = "=7.0.0-beta.9", path = "./rust/lance-namespace-datafusion" } lance-namespace-reqwest-client = "0.8.4" -lance-select = { version = "=8.0.0-beta.11", path = "./rust/lance-select" } -lance-tokenizer = { version = "=8.0.0-beta.11", path = "./rust/lance-tokenizer" } -lance-table = { version = "=8.0.0-beta.11", path = "./rust/lance-table" } -lance-test-macros = { version = "=8.0.0-beta.11", path = "./rust/lance-test-macros" } -lance-testing = { version = "=8.0.0-beta.11", path = "./rust/lance-testing" } +lance-select = { version = "=8.0.0-beta.12", path = "./rust/lance-select" } +lance-tokenizer = { version = "=8.0.0-beta.12", path = "./rust/lance-tokenizer" } +lance-table = { version = "=8.0.0-beta.12", path = "./rust/lance-table" } +lance-test-macros = { version = "=8.0.0-beta.12", path = "./rust/lance-test-macros" } +lance-testing = { version = "=8.0.0-beta.12", path = "./rust/lance-testing" } approx = "0.5.1" # Note that this one does not include pyarrow arrow = { version = "58.0.0", optional = false, features = ["prettyprint"] } @@ -104,7 +104,7 @@ half = { "version" = "2.1", default-features = false, features = [ "num-traits", "std", ] } -lance-bitpacking = { version = "=8.0.0-beta.11", path = "./rust/compression/bitpacking" } +lance-bitpacking = { version = "=8.0.0-beta.12", path = "./rust/compression/bitpacking" } bitpacking = "0.9" bitvec = "1" bytes = "1.11.1" @@ -143,7 +143,7 @@ datafusion-substrait = "53.0.0" dirs = "6.0.0" either = "1.0" fst = { version = "0.4.7", features = ["levenshtein"] } -fsst = { version = "=8.0.0-beta.11", path = "./rust/compression/fsst" } +fsst = { version = "=8.0.0-beta.12", path = "./rust/compression/fsst" } futures = "0.3" geoarrow-array = "0.8" geoarrow-schema = "0.8" diff --git 
a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index f4cfc21ec9c..3cd9fc35067 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -988,9 +988,9 @@ dependencies = [ [[package]] name = "block-buffer" -version = "0.12.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdd35008169921d80bc60d3d0ab416eecb028c4cd653352907921d95084790be" +checksum = "d2f6c7dbe95a6ed67ad9f18e57daf93a2f034c524b99fd2b76d18fdfeb6660aa" dependencies = [ "hybrid-array", ] @@ -2298,7 +2298,7 @@ version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1dd6dbb5841937940781866fa1281a1ff7bd3bf827091440879f9994983d5c2" dependencies = [ - "block-buffer 0.12.0", + "block-buffer 0.12.1", "const-oid 0.10.2", "crypto-common 0.2.2", "ctutils", @@ -2549,7 +2549,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow-array", "rand 0.9.4", @@ -3749,7 +3749,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arc-swap", "arrow", @@ -3822,7 +3822,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow-array", "arrow-buffer", @@ -3864,7 +3864,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrayref", "paste", @@ -3873,7 +3873,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow-array", "arrow-buffer", @@ -3911,7 +3911,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow", "arrow-array", @@ -3943,7 +3943,7 @@ dependencies 
= [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow", "arrow-array", @@ -3961,7 +3961,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "proc-macro2", "quote", @@ -3970,7 +3970,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow-arith", "arrow-array", @@ -4005,7 +4005,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow-arith", "arrow-array", @@ -4035,7 +4035,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "datafusion", "geo-traits", @@ -4049,7 +4049,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arc-swap", "arrow", @@ -4116,7 +4116,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow", "arrow-arith", @@ -4157,7 +4157,7 @@ dependencies = [ [[package]] name = "lance-jni" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow", "arrow-array", @@ -4193,7 +4193,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow-array", "arrow-buffer", @@ -4208,7 +4208,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow", "async-trait", @@ -4220,7 +4220,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow", "arrow-ipc", @@ -4228,6 +4228,8 @@ dependencies = [ "async-trait", "axum", "bytes", + "datafusion-common", + "datafusion-physical-plan", "futures", "lance", "lance-core", @@ 
-4240,19 +4242,21 @@ dependencies = [ "object_store", "rand 0.9.4", "reqwest 0.12.28", + "roaring", "serde", "serde_json", "tokio", "tower", "tower-http 0.5.2", "url", + "uuid", ] [[package]] name = "lance-namespace-reqwest-client" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04b4e5caefa132a9cce54b2d4dc95016b949b3a290a83ad5057e705df43d75be" +checksum = "0d287494559c22838ce34e51ea0fa29dc780d5be8283de5ab33e9395623000c8" dependencies = [ "reqwest 0.12.28", "serde", @@ -4264,7 +4268,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow-array", "arrow-buffer", @@ -4279,7 +4283,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow", "arrow-array", @@ -4316,7 +4320,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "icu_segmenter", "rust-stemmers", diff --git a/java/lance-jni/Cargo.toml b/java/lance-jni/Cargo.toml index f1144423c0d..035a0e0ce8c 100644 --- a/java/lance-jni/Cargo.toml +++ b/java/lance-jni/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lance-jni" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" edition = "2024" authors = ["Lance Devs "] rust-version = "1.91" diff --git a/java/pom.xml b/java/pom.xml index e5791f8155d..3a377c35150 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -7,7 +7,7 @@ org.lance lance-core Lance Core - 8.0.0-beta.11 + 8.0.0-beta.12 jar Lance Format Java API diff --git a/python/Cargo.lock b/python/Cargo.lock index 5a6fb26be91..0be9c074402 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -1112,9 +1112,9 @@ dependencies = [ [[package]] name = "block-buffer" -version = "0.12.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"cdd35008169921d80bc60d3d0ab416eecb028c4cd653352907921d95084790be" +checksum = "d2f6c7dbe95a6ed67ad9f18e57daf93a2f034c524b99fd2b76d18fdfeb6660aa" dependencies = [ "hybrid-array", ] @@ -2639,7 +2639,7 @@ version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1dd6dbb5841937940781866fa1281a1ff7bd3bf827091440879f9994983d5c2" dependencies = [ - "block-buffer 0.12.0", + "block-buffer 0.12.1", "const-oid 0.10.2", "crypto-common 0.2.2", "ctutils", @@ -2899,7 +2899,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4115,7 +4115,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arc-swap", "arrow", @@ -4189,7 +4189,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow-array", "arrow-buffer", @@ -4231,7 +4231,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrayref", "paste", @@ -4240,7 +4240,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow-array", "arrow-buffer", @@ -4278,7 +4278,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow", "arrow-array", @@ -4310,7 +4310,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow", "arrow-array", @@ -4328,7 +4328,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "proc-macro2", "quote", @@ -4337,7 +4337,7 @@ dependencies = [ 
[[package]] name = "lance-encoding" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow-arith", "arrow-array", @@ -4372,7 +4372,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow-arith", "arrow-array", @@ -4402,7 +4402,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "datafusion", "geo-traits", @@ -4416,7 +4416,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arc-swap", "arrow", @@ -4484,7 +4484,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow", "arrow-arith", @@ -4525,7 +4525,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow-array", "arrow-buffer", @@ -4540,7 +4540,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow", "async-trait", @@ -4552,7 +4552,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow", "arrow-ipc", @@ -4586,9 +4586,9 @@ dependencies = [ [[package]] name = "lance-namespace-reqwest-client" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04b4e5caefa132a9cce54b2d4dc95016b949b3a290a83ad5057e705df43d75be" +checksum = "0d287494559c22838ce34e51ea0fa29dc780d5be8283de5ab33e9395623000c8" dependencies = [ "reqwest 0.12.28", "serde", @@ -4600,7 +4600,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow-array", "arrow-buffer", @@ -4615,7 +4615,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.11" 
+version = "8.0.0-beta.12" dependencies = [ "arrow", "arrow-array", @@ -4654,7 +4654,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "icu_segmenter", "jieba-rs", @@ -6142,7 +6142,7 @@ dependencies = [ [[package]] name = "pylance" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" dependencies = [ "arrow", "arrow-array", diff --git a/python/Cargo.toml b/python/Cargo.toml index f7d6280644a..e066e7d1077 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pylance" -version = "8.0.0-beta.11" +version = "8.0.0-beta.12" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" From ddefa447a106959a8db63dcbc59e4026bd05f9e3 Mon Sep 17 00:00:00 2001 From: Rudi Floren Date: Thu, 11 Jun 2026 22:21:32 +0200 Subject: [PATCH 090/177] fix(datafusion): coerce filter literals for dictionary-encoded columns (#7003) Closes #7002 SQL string filters on a dictionary-encoded column (e.g. `Dictionary(Int16, Utf8)`) failed to plan with "could not convert to literal of type 'Dictionary(...)'" because `safe_coerce_scalar` had no arm for a dictionary target and no `ScalarValue::Dictionary` input arm. This also silently lost scalar-index pushdown for `=`/`IN` predicates on dictionary columns. Add two generic guard clauses to `safe_coerce_scalar`: unwrap a dictionary literal and coerce its inner value, and coerce a value to a dictionary target by recursing on the value type and re-wrapping. Nulls keep their untyped form, matching the existing behavior for all targets. Enabling pushdown exposed a `todo!()` in `OrderableScalarValue::cmp` for `Dictionary` vs `Dictionary` (scalar indexes store dictionary columns as `Dictionary` scalar keys in a BTreeMap), which would have turned a previously-working full-scan query into a panic. Implement the comparison by delegating to the inner values. This was created by Claude Code using Opus 4.8. 
--------- Co-authored-by: Claude Opus 4.8 (1M context) --- rust/lance-datafusion/src/expr.rs | 108 ++++++++++++++++++++++ rust/lance-datafusion/src/logical_expr.rs | 54 +++++++++++ rust/lance-index/src/scalar/btree.rs | 33 ++++++- rust/lance/src/dataset/scanner.rs | 87 +++++++++++++++++ 4 files changed, 281 insertions(+), 1 deletion(-) diff --git a/rust/lance-datafusion/src/expr.rs b/rust/lance-datafusion/src/expr.rs index 79650f6775e..a0da34ba2bb 100644 --- a/rust/lance-datafusion/src/expr.rs +++ b/rust/lance-datafusion/src/expr.rs @@ -17,6 +17,18 @@ const MS_PER_DAY: i64 = 86400000; // will always yield "x = 7_u64" regardless of the type of the column "x". As a result, we // need to do that literal coercion ourselves. pub fn safe_coerce_scalar(value: &ScalarValue, ty: &DataType) -> Option { + // A dictionary target coerces the value to the dictionary's value type and + // re-wraps it as a dictionary literal. Only an untyped `ScalarValue::Null` + // keeps its untyped form, matching the behavior for all other targets; a + // *typed* null (e.g. `Utf8(None)`) is coerced and wrapped like any other + // value so it produces a `Dictionary(..)` literal that matches the column. + if let DataType::Dictionary(key_type, value_type) = ty { + if matches!(value, ScalarValue::Null) { + return Some(value.clone()); + } + let inner = safe_coerce_scalar(value, value_type)?; + return Some(ScalarValue::Dictionary(key_type.clone(), Box::new(inner))); + } match value { ScalarValue::Int8(val) => match ty { DataType::Int8 => Some(value.clone()), @@ -436,6 +448,9 @@ pub fn safe_coerce_scalar(value: &ScalarValue, ty: &DataType) -> Option Some(value.clone()), _ => None, }, + // A dictionary-encoded literal (e.g. produced by DataFusion's dictionary + // cast in the scalar-index path) coerces by unwrapping its underlying value. 
+ ScalarValue::Dictionary(_, inner) => safe_coerce_scalar(inner, ty), _ => None, } } @@ -775,4 +790,97 @@ mod tests { Some(ScalarValue::BinaryView(Some(vec![1, 2, 3]))) ); } + + #[test] + fn test_dictionary_coerce() { + let dict_ty = DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)); + + // A string literal coerces to a dictionary target by wrapping the + // coerced value in a dictionary scalar. + assert_eq!( + safe_coerce_scalar(&ScalarValue::Utf8(Some("com".to_string())), &dict_ty), + Some(ScalarValue::Dictionary( + Box::new(DataType::Int16), + Box::new(ScalarValue::Utf8(Some("com".to_string()))), + )) + ); + + // The inner value is coerced through to the dictionary value type, so a + // LargeUtf8 literal lands as a Utf8 value inside the dictionary. + assert_eq!( + safe_coerce_scalar(&ScalarValue::LargeUtf8(Some("com".to_string())), &dict_ty), + Some(ScalarValue::Dictionary( + Box::new(DataType::Int16), + Box::new(ScalarValue::Utf8(Some("com".to_string()))), + )) + ); + + // A dictionary literal round-trips back to its value type. + assert_eq!( + safe_coerce_scalar( + &ScalarValue::Dictionary( + Box::new(DataType::Int16), + Box::new(ScalarValue::Utf8(Some("com".to_string()))), + ), + &DataType::Utf8, + ), + Some(ScalarValue::Utf8(Some("com".to_string()))) + ); + + // A dictionary literal coerces to a dictionary target, adopting the + // target's key type. + assert_eq!( + safe_coerce_scalar( + &ScalarValue::Dictionary( + Box::new(DataType::Int32), + Box::new(ScalarValue::Utf8(Some("com".to_string()))), + ), + &dict_ty, + ), + Some(ScalarValue::Dictionary( + Box::new(DataType::Int16), + Box::new(ScalarValue::Utf8(Some("com".to_string()))), + )) + ); + + // An untyped null keeps its untyped form for a dictionary target, just + // like for every other target type. + assert_eq!( + safe_coerce_scalar(&ScalarValue::Null, &dict_ty), + Some(ScalarValue::Null) + ); + + // A *typed* null (e.g. 
an API-built `Utf8(None)` literal, or an IN value + // already typed as Utf8) is still wrapped in the dictionary type so it + // matches the dictionary column. Returning a bare `Utf8(None)` here would + // leave `resolve_value` with a literal whose type does not line up with + // the column, breaking planning/evaluation the same way non-null strings + // used to break. + assert_eq!( + safe_coerce_scalar(&ScalarValue::Utf8(None), &dict_ty), + Some(ScalarValue::Dictionary( + Box::new(DataType::Int16), + Box::new(ScalarValue::Utf8(None)), + )) + ); + + // The inner null is coerced through to the dictionary value type as well, + // so a LargeUtf8 typed null lands as a Utf8 null inside the dictionary. + assert_eq!( + safe_coerce_scalar(&ScalarValue::LargeUtf8(None), &dict_ty), + Some(ScalarValue::Dictionary( + Box::new(DataType::Int16), + Box::new(ScalarValue::Utf8(None)), + )) + ); + + // A value that cannot be coerced to the dictionary value type fails. + assert_eq!( + safe_coerce_scalar( + &ScalarValue::Utf8(Some("com".to_string())), + &DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Int32)), + ), + None + ); + } } diff --git a/rust/lance-datafusion/src/logical_expr.rs b/rust/lance-datafusion/src/logical_expr.rs index ab0936d31da..0eed438dae7 100644 --- a/rust/lance-datafusion/src/logical_expr.rs +++ b/rust/lance-datafusion/src/logical_expr.rs @@ -463,4 +463,58 @@ mod tests { _ => unreachable!("Expected BinaryExpr"), } } + + #[test] + fn test_resolve_typed_null_against_dictionary_column() { + // A dictionary-encoded string column, e.g. a categorical field. + let dict_ty = DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)); + let arrow_schema = ArrowSchema::new(vec![Field::new("etld", dict_ty, true)]); + let schema = Schema::try_from(&arrow_schema).unwrap(); + + // A typed null must be wrapped in the dictionary type, not left as a bare + // `Utf8(None)` literal sitting next to a `Dictionary(...)` column. 
+ let expected_null = Expr::Literal( + ScalarValue::Dictionary(Box::new(DataType::Int16), Box::new(ScalarValue::Utf8(None))), + None, + ); + + // `etld = ` built directly via the API, as opposed to coming + // through SQL parsing. + let expr = Expr::BinaryExpr(BinaryExpr { + left: Box::new(Expr::Column("etld".to_string().into())), + op: Operator::Eq, + right: Box::new(Expr::Literal(ScalarValue::Utf8(None), None)), + }); + match resolve_expr(&expr, &schema).unwrap() { + Expr::BinaryExpr(be) => assert_eq!(be.right.as_ref(), &expected_null), + other => unreachable!("Expected BinaryExpr, got {other:?}"), + } + + // `etld IN ('a', )` — a typed value mixed with a typed null, + // both already typed as Utf8. Every list element is wrapped in the + // dictionary type. + let expr = Expr::in_list( + Expr::Column("etld".to_string().into()), + vec![ + Expr::Literal(ScalarValue::Utf8(Some("a".to_string())), None), + Expr::Literal(ScalarValue::Utf8(None), None), + ], + false, + ); + let expected = Expr::in_list( + Expr::Column("etld".to_string().into()), + vec![ + Expr::Literal( + ScalarValue::Dictionary( + Box::new(DataType::Int16), + Box::new(ScalarValue::Utf8(Some("a".to_string()))), + ), + None, + ), + expected_null, + ], + false, + ); + assert_eq!(resolve_expr(&expr, &schema).unwrap(), expected); + } } diff --git a/rust/lance-index/src/scalar/btree.rs b/rust/lance-index/src/scalar/btree.rs index 6128248308e..6d21e842e04 100644 --- a/rust/lance-index/src/scalar/btree.rs +++ b/rust/lance-index/src/scalar/btree.rs @@ -589,7 +589,7 @@ impl Ord for OrderableScalarValue { } } (Struct(_arr), _) => panic!("Attempt to compare Struct with non-Struct"), - (Dictionary(_k1, _v1), Dictionary(_k2, _v2)) => todo!(), + (Dictionary(_k1, v1), Dictionary(_k2, v2)) => Self(*v1.clone()).cmp(&Self(*v2.clone())), (Dictionary(_, v1), Null) => Self(*v1.clone()).cmp(&Self(ScalarValue::Null)), (Dictionary(_, _), _) => panic!("Attempt to compare Dictionary with non-Dictionary"), // What would a btree of 
unions even look like? May not be possible. @@ -3310,6 +3310,37 @@ mod tests { assert!(size_of_many_i32 > 128 * 4); } + #[test] + fn test_orderable_dictionary_cmp() { + use arrow_schema::DataType; + use std::cmp::Ordering; + + let dict = |s: &str, key: DataType| { + OrderableScalarValue(ScalarValue::Dictionary( + Box::new(key), + Box::new(ScalarValue::Utf8(Some(s.to_string()))), + )) + }; + + // Dictionary scalars are ordered by their underlying value, regardless + // of the key type. This is exercised when loading a scalar index built + // on a dictionary-encoded column into a BTreeMap. + assert_eq!( + dict("a", DataType::Int16).cmp(&dict("b", DataType::Int16)), + Ordering::Less + ); + assert_eq!( + dict("b", DataType::Int32).cmp(&dict("b", DataType::Int16)), + Ordering::Equal + ); + + // A non-null dictionary value sorts after null. + assert_eq!( + dict("a", DataType::Int16).cmp(&OrderableScalarValue(ScalarValue::Null)), + Ordering::Greater + ); + } + #[tokio::test] async fn test_null_ids() { let tmpdir = TempObjDir::default(); diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index 6b19150c17b..1112721bb33 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -8843,6 +8843,93 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") ); } + /// Build an in-memory dataset with a single `Dictionary(Int16, Utf8)` column. + /// The dictionary cycles through "a", "b", "c" so each value appears in a + /// predictable, repeated pattern. 
+ async fn dictionary_string_dataset() -> Dataset { + use arrow_array::{Int16Array, Int16DictionaryArray}; + + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "etld", + DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)), + false, + )])); + + let dictionary = Arc::new(StringArray::from(vec!["a", "b", "c"])); + let indices = Int16Array::from((0..30).map(|i| i % 3).collect::>()); + let dict_array = Int16DictionaryArray::try_new(indices, dictionary).unwrap(); + + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(dict_array)]).unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + Dataset::write(reader, "memory://test_dict_filter", None) + .await + .unwrap() + } + + /// Regression test for filtering a dictionary-encoded string column via the + /// SQL string path (`Scanner::filter`). This used to fail to plan with + /// "could not convert to literal of type 'Dictionary(Int16, Utf8)'". + #[tokio::test] + async fn test_filter_on_dictionary_string_column() { + let dataset = dictionary_string_dataset().await; + + // Equality predicate. + let count = dataset + .scan() + .filter("etld = 'a'") + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows(); + assert_eq!(count, 10); + + // IN-list predicate. + let count = dataset + .scan() + .filter("etld IN ('a', 'b')") + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows(); + assert_eq!(count, 20); + } + + /// An `IN`/`=` predicate on a dictionary column with a scalar index should be + /// pushed down to the index rather than falling back to a full scan. 
+ #[tokio::test] + async fn test_dictionary_string_column_uses_scalar_index() { + use lance_index::scalar::BuiltinIndexType; + + let mut dataset = dictionary_string_dataset().await; + let params = ScalarIndexParams::for_builtin(BuiltinIndexType::Bitmap); + dataset + .create_index(&["etld"], IndexType::Scalar, None, &params, true) + .await + .unwrap(); + + let mut scanner = dataset.scan(); + scanner.filter("etld IN ('a', 'b')").unwrap(); + let plan = scanner.create_plan().await.unwrap(); + let plan_str = format!("{:?}", plan); + assert!( + plan_str.contains("ScalarIndexExec") || plan_str.contains("MaterializeIndex"), + "IN on a dictionary column should use the scalar index, but got: {}", + plan_str + ); + + let count = dataset + .scan() + .filter("etld IN ('a', 'b')") + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows(); + assert_eq!(count, 20); + } + #[tokio::test] async fn test_like_prefix_with_segmented_zone_map() { use lance_index::scalar::BuiltinIndexType; From b6a99cda9eb94a5a08f9eb63d9db0613aaa5dad8 Mon Sep 17 00:00:00 2001 From: Yang Cen Date: Fri, 12 Jun 2026 12:00:31 +0800 Subject: [PATCH 091/177] perf(vector)!: add dedicated SIMD kernels for RaBitQ ex-code reranking (#7205) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem Multi-bit RaBitQ (`num_bits >= 2`) reranking computes `sum_d query[d] * ex_code[d]` per candidate. Before this change it had two slow paths: - The top-k heap pruning path did scalar bit extraction per dimension plus a gather from a `dim * 2^ex_bits` f32 table (1.5 MiB at `ex_bits=8`, `dim=1536` — cache hostile, and the table was rebuilt per query). - FastScan only covered `ex_bits` in {2, 4, 8} with extra u8 quantization error; `ex_bits` in {1, 3, 5, 6, 7} were fully scalar. ## Change Two parts, following the kernel design of the RaBitQ reference library ([VectorDB-NTU/RaBitQ-Library](https://github.com/VectorDB-NTU/RaBitQ-Library), Apache-2.0): **1.
Dedicated SIMD inner-product kernels** (`vector/bq/ex_dot.rs`): unpack packed ex codes with shifts/masks and FMA them against the f32 query directly — no LUT, no quantization error. All `ex_bits` 1..=8, with scalar, AVX2, AVX-512, and NEON variants (runtime dispatch, resolved once per dist calculator). Used by the heap-pruning rerank, `distance()`, and the bulk tail; the quantized ex dist table is now only built for the FastScan widths. **2. A new on-disk ex-code layout** (`__blocked_ex_codes` column): dims are grouped into 64-dim blocks (last block zero-padded) and bit-interleaved within each block so the SIMD unpack emits codes in natural dim order (3 bits = 2-bit plane + top-bit plane, 5 = 4+1, 6 = 6|2, 7 = 6|2+1). Because unpack order is natural, the kernels read the rotated query **as stored** — there is no per-query query permutation, no load-time repack, and no extra resident copy. An earlier iteration of this PR kept the old format and paid for it on every query and partition load; benchmarks below quantify the difference. ### Compatibility - **Old indexes** (sequential `__ex_codes`) remain readable: rows are repacked once at load and the in-memory batch is normalized to the new column, so every rewrite path (remap, optimize merges, distributed shard merges — including mixed-version shards) emits the blocked format and legacy indexes upgrade on their next rewrite. - **Old lance versions cannot read new multi-bit indexes**: they fail loudly with a missing-column error (`requires __ex_codes column`) instead of misinterpreting bytes. - **`num_bits=1` indexes carry no ex codes and are unaffected in both directions.** - Binary codes, factor columns, and index metadata are unchanged. ## Benchmarks GCP `c3-standard-8` (Sapphire Rapids, AVX-512), 1024 rows per measurement. 
Per-candidate rerank kernel vs the previous table-gather, production embedding dims: | ex_bits | dim | kernel | table-gather | speedup | |---|---|---|---|---| | 3 | 1536 | 172.7 µs | 2.464 ms | 14.3x | | 5 | 1536 | 169.9 µs | 2.534 ms | 14.9x | | 3 | 2048 | 235.3 µs | 3.264 ms | 13.9x | | 5 | 2048 | 232.7 µs | 3.377 ms | 14.5x | All widths at dim=1536: 87.0–181.7 µs (ex_bits 8 fastest, 7 slowest); at dim=2048: 117.0–243.5 µs. Apple M-series NEON shows 9–14x over the gather at dim=1024. Storage load path (`try_from_batch`, `num_bits=4`, dim=1536, 8192 rows): | format | load time | |---|---| | blocked (new) | **572 ns** (width validation only, column aliased) | | legacy sequential (repacked) | 21.1 ms | This load-time repack and the per-query permutation were measured by an end-to-end workload as +9.65% avg latency (normal mode, `num_bits=4`, where only 0.138% of candidates reach the exact rerank); with the blocked format the query and load paths do strictly less work than before this PR. The FastScan ex artifacts are also built only when they can be consumed: the bulk LUT branch is reachable only for legacy indexes without error factors (gated indexes rerank per candidate), so fresh indexes skip the FastScan transpose (one resident copy of the ex codes plus a per-load transpose) and no path materializes the `dim * 2^ex_bits` f32 table anymore — the u8 LUT is computed directly from the rotated query, which also speeds the legacy bypass itself by 3-6%. The LUT bulk path is kept for legacy indexes because it scores 2.5x (`ex_bits=4`) to 7x (`ex_bits=2`) faster per row than the kernel loop at dim=1536 (16384-row partitions); gated indexes calling `distance_all` fall through to the exact per-row kernels. 
## End-to-end benchmark (search-benchmark suite) Fresh GCP `c4-standard-16`, IVF_RQ, `num_bits=5`, partition size 4096, `nprobes=24`, `k=10`, single-threaded, gist (960d) and dbpedia (1536d), `main` @ e256207 vs this branch @ da7f2dd: | dataset | approx_mode | main avg | branch avg | latency | QPS | recall main / branch | |---|---|---|---|---|---|---| | dbpedia | fast | 3.321 ms | 3.175 ms | -4.4% | +4.6% | 0.8145 / 0.8142 | | dbpedia | normal | 3.911 ms | 3.802 ms | -2.8% | +2.5% | 0.9620 / 0.9611 | | dbpedia | accurate | 4.765 ms | 4.508 ms | -5.4% | +5.7% | 0.9661 / 0.9657 | | gist | fast | 2.347 ms | 2.259 ms | -3.8% | +4.0% | 0.6539 / 0.6577 | | gist | normal | 2.981 ms | 2.939 ms | -1.4% | +1.4% | 0.9345 / 0.9370 | | gist | accurate | 3.439 ms | 3.434 ms | -0.1% | +0.1% | 0.9422 / 0.9428 | p99 improves across the board (dbpedia accurate: -8.4%); recall and index build time are unchanged. Note this benchmark rebuilds the index per run with randomized rotation/kmeans, so single-run readings vary by up to ~10% — a pinned-core (taskset) steady-state query loop on the same machine and index gives a cleaner paired measurement: **266.0/266.6 QPS (main) vs 295.3/298.9 QPS (this branch), +12%, run-to-run spread under 0.3%** (dbpedia, normal mode). A cpu-clock profile of that loop attributes the gain directly to this PR: the exact ex rerank drops from 8.07% of query time (table gather) to 1.15% (ex-dot kernel, ~8x in vivo at an exact rerank rate of 0.14%), and the per-partition-load ex FastScan transpose (2.88%, background threads) disappears entirely. An earlier iteration of this change (kernels on top of the unchanged sequential format) measured **+9.65%** average latency on this kind of workload from the per-load repack and per-query permutation; the blocked format turns that into the gains above. ## Testing All run on GCP: Sapphire Rapids (AVX2 + AVX-512 paths) **and** Ampere Altra (NEON), plus an AddressSanitizer pass over the bq suite on x86_64 (no findings). 
- Differential kernel tests vs an f64 reference: every variant x `ex_bits` 1..=8 x dims {7, 16, 60, 64, 100, 128, 1024, 1536, 2048}, dense dim sweep (1..=160) for the bit-plane widths, pack/unpack round-trips, legacy-repack == writer-pack equivalence. - Storage-level reference test across all widths and **both formats** at dims 72 and 1536. - FastScan ex LUT test extended to `num_bits` {3, 5, 9} x both formats with a per-dim varying query, an unaligned dim (padded-tail LUT entries), and a quantization-derived error bound. - `remap` round-trip across widths {4, 6, 8, 9} with remap == fresh-reload equivalence; end-to-end IVF_RQ build+search+recall at `num_bits` {4, 6, 9}; distributed merger tests. - `cargo fmt --all` and `cargo clippy --all --tests --benches -- -D warnings` are clean. ## Breaking changes On-disk: multi-bit (`num_bits >= 2`) IVF_RQ indexes written by this version use the new `__blocked_ex_codes` column and are rejected (loud error) by older lance versions. Existing indexes keep working and upgrade on rewrite. `num_bits=1` is fully compatible both ways. Rust API: `RabitDistCalculator::new` and `RabitRawQueryContext` gained parameters/fields (same pattern as #7179). Python/Java bindings are unaffected. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Fable 5 --- rust/lance-index/benches/rq.rs | 400 +++++- rust/lance-index/src/vector/bq.rs | 1 + rust/lance-index/src/vector/bq/builder.rs | 34 +- rust/lance-index/src/vector/bq/ex_dot.rs | 1078 +++++++++++++++++ rust/lance-index/src/vector/bq/storage.rs | 860 +++++++++---- rust/lance-index/src/vector/bq/transform.rs | 21 +- .../src/vector/distributed/index_merger.rs | 28 +- rust/lance-index/src/vector/storage.rs | 5 +- rust/lance/src/index/vector/ivf/v2.rs | 71 +- 9 files changed, 2207 insertions(+), 291 deletions(-) create mode 100644 rust/lance-index/src/vector/bq/ex_dot.rs diff --git a/rust/lance-index/benches/rq.rs b/rust/lance-index/benches/rq.rs index 4a7364d1313..e29ce9c4695 100644 --- a/rust/lance-index/benches/rq.rs +++ b/rust/lance-index/benches/rq.rs @@ -17,11 +17,16 @@ use lance_datagen::array::rand_type; use lance_datagen::{BatchGeneratorBuilder, RowCount}; use lance_index::vector::bq::RQRotationType; use lance_index::vector::bq::builder::RabitQuantizer; +use lance_index::vector::bq::ex_dot::{ + blocked_ex_code_bytes, ex_dot_kernel, pack_blocked_row, packed_ex_code_value, +}; use lance_index::vector::bq::storage::*; use lance_index::vector::bq::transform::{ADD_FACTORS_COLUMN, SCALE_FACTORS_COLUMN}; use lance_index::vector::quantizer::{Quantization, QuantizerStorage}; use lance_index::vector::storage::{DistCalculator, VectorStore}; use lance_linalg::distance::DistanceType; +use rand::rngs::SmallRng; +use rand::{Rng, SeedableRng}; const DIM: usize = 128; const TOTAL: usize = 16 * 1000; @@ -119,16 +124,397 @@ fn compute_distances(c: &mut Criterion) { } } -#[cfg(target_os = "linux")] -criterion_group!( - name=benches; - config = Criterion::default().measurement_time(Duration::from_secs(10)); - targets = construct_dist_table, compute_distances); +/// The table-gather ex distance used before the dedicated ex-dot kernels, +/// kept here as the baseline: per dim, 
extract the packed code and gather +/// `query[d] * code` from a `dim * 2^ex_bits` table. +fn gather_ex_distance(row_codes: &[u8], dim: usize, ex_bits: u8, ex_dist_table: &[f32]) -> f32 { + let entries_per_dim = 1usize << ex_bits; + (0..dim) + .map(|dim_idx| { + let code = packed_ex_code_value(row_codes, dim_idx, ex_bits) as usize; + ex_dist_table[dim_idx * entries_per_dim + code] + }) + .sum() +} + +fn ex_dot_kernels(c: &mut Criterion) { + for ex_dim in [1536usize, 2048] { + ex_dot_kernels_for_dim(c, ex_dim); + } +} + +fn ex_dot_kernels_for_dim(c: &mut Criterion, ex_dim: usize) { + const NUM_ROWS: usize = 1024; + + let mut rng = SmallRng::seed_from_u64(42); + let query = (0..ex_dim) + .map(|_| rng.random_range(-1.0f32..1.0)) + .collect::>(); + + for ex_bits in 1..=8u8 { + let max_code = ((1u16 << ex_bits) - 1) as u8; + let values = (0..NUM_ROWS * ex_dim) + .map(|_| rng.random_range(0..=max_code)) + .collect::>(); + + // The gather baseline reads the legacy sequential layout it shipped + // with; the kernel reads the blocked layout. + let seq_code_len = (ex_dim * ex_bits as usize).div_ceil(8); + let mut seq_codes = vec![0u8; NUM_ROWS * seq_code_len]; + for (row, row_values) in seq_codes + .chunks_exact_mut(seq_code_len) + .zip(values.chunks_exact(ex_dim)) + { + for (dim, &value) in row_values.iter().enumerate() { + let bit_offset = dim * ex_bits as usize; + let bits = (value as u16) << (bit_offset % 8); + row[bit_offset / 8] |= bits as u8; + if bits >> 8 != 0 { + row[bit_offset / 8 + 1] |= (bits >> 8) as u8; + } + } + } + + let kernel_code_len = blocked_ex_code_bytes(ex_dim, ex_bits); + let mut kernel_codes = vec![0u8; NUM_ROWS * kernel_code_len]; + for (row, row_values) in kernel_codes + .chunks_exact_mut(kernel_code_len) + .zip(values.chunks_exact(ex_dim)) + { + pack_blocked_row(row_values, ex_bits, row); + } + + // ex_dim is block-aligned here, so the kernels read the query as-is. 
+ let ex_query = &query; + let kernel = ex_dot_kernel(ex_bits); + c.bench_function( + format!("RQ ex_dot kernel: ex_bits={ex_bits}, DIM={ex_dim}, rows={NUM_ROWS}").as_str(), + |b| { + b.iter(|| { + let mut sum = 0.0f32; + for row in kernel_codes.chunks_exact(kernel_code_len) { + sum += kernel(ex_query, row); + } + black_box(sum) + }) + }, + ); + + let entries_per_dim = 1usize << ex_bits; + let mut ex_dist_table = vec![0.0f32; ex_dim * entries_per_dim]; + for (dim, table) in ex_dist_table.chunks_exact_mut(entries_per_dim).enumerate() { + for (code, value) in table.iter_mut().enumerate() { + *value = query[dim] * code as f32; + } + } + c.bench_function( + format!("RQ ex_dot table-gather: ex_bits={ex_bits}, DIM={ex_dim}, rows={NUM_ROWS}") + .as_str(), + |b| { + b.iter(|| { + let mut sum = 0.0f32; + for row in seq_codes.chunks_exact(seq_code_len) { + sum += gather_ex_distance(row, ex_dim, ex_bits, &ex_dist_table); + } + black_box(sum) + }) + }, + ); + } +} + +/// Storage load cost per format: blocked-format ex codes are aliased as-is, +/// legacy sequential ex codes are repacked row by row. 
+fn ex_code_storage_load(c: &mut Criterion) { + use arrow_array::{ArrayRef, FixedSizeListArray, Float32Array, UInt8Array, UInt64Array}; + use lance_arrow::FixedSizeListArrayExt; + use lance_index::vector::bq::ex_dot::repack_sequential_row; + use lance_index::vector::bq::rabit_ex_code_bytes; + use lance_index::vector::bq::transform::{EX_ADD_FACTORS_COLUMN, EX_SCALE_FACTORS_COLUMN}; + use std::sync::Arc; + + const LOAD_DIM: usize = 1536; + const LOAD_ROWS: usize = 8192; + const NUM_BITS: u8 = 4; // ex_bits=3, a bit-plane width + + let ex_bits = NUM_BITS - 1; + let mut rng = SmallRng::seed_from_u64(7); + let metadata = RabitQuantizationMetadata { + rotate_mat: None, + rotate_mat_position: None, + fast_rotation_signs: None, + rotation_type: RQRotationType::Fast, + code_dim: LOAD_DIM as u32, + num_bits: NUM_BITS, + packed: true, + query_estimator: RabitQueryEstimator::RawQuery, + }; + let code_len = LOAD_DIM / 8; + let binary_codes = (0..LOAD_ROWS * code_len) + .map(|_| rng.random_range(0..=u8::MAX)) + .collect::>(); + let seq_code_len = rabit_ex_code_bytes(LOAD_DIM, ex_bits).unwrap(); + let seq_codes = (0..LOAD_ROWS * seq_code_len) + .map(|_| rng.random_range(0..=u8::MAX)) + .collect::>(); + let blocked_code_len = blocked_ex_code_bytes(LOAD_DIM, ex_bits); + let mut blocked_codes = vec![0u8; LOAD_ROWS * blocked_code_len]; + for (seq_row, blocked_row) in seq_codes + .chunks_exact(seq_code_len) + .zip(blocked_codes.chunks_exact_mut(blocked_code_len)) + { + repack_sequential_row(seq_row, LOAD_DIM, ex_bits, blocked_row); + } + + let make_batch = |ex_column: &str, ex_values: Vec, ex_code_len: usize| { + arrow_array::RecordBatch::try_from_iter(vec![ + ( + ROW_ID, + Arc::new(UInt64Array::from_iter_values(0..LOAD_ROWS as u64)) as ArrayRef, + ), + ( + RABIT_CODE_COLUMN, + Arc::new( + FixedSizeListArray::try_new_from_values( + UInt8Array::from(binary_codes.clone()), + code_len as i32, + ) + .unwrap(), + ) as ArrayRef, + ), + ( + ADD_FACTORS_COLUMN, + 
Arc::new(Float32Array::from(vec![0.0f32; LOAD_ROWS])) as ArrayRef, + ), + ( + SCALE_FACTORS_COLUMN, + Arc::new(Float32Array::from(vec![0.0f32; LOAD_ROWS])) as ArrayRef, + ), + ( + ex_column, + Arc::new( + FixedSizeListArray::try_new_from_values( + UInt8Array::from(ex_values), + ex_code_len as i32, + ) + .unwrap(), + ) as ArrayRef, + ), + ( + EX_ADD_FACTORS_COLUMN, + Arc::new(Float32Array::from(vec![0.0f32; LOAD_ROWS])) as ArrayRef, + ), + ( + EX_SCALE_FACTORS_COLUMN, + Arc::new(Float32Array::from(vec![0.0f32; LOAD_ROWS])) as ArrayRef, + ), + ]) + .unwrap() + }; + + let blocked_batch = make_batch( + RABIT_BLOCKED_EX_CODE_COLUMN, + blocked_codes, + blocked_code_len, + ); + c.bench_function( + format!("RQ storage load (blocked ex codes): num_bits={NUM_BITS}, DIM={LOAD_DIM}, rows={LOAD_ROWS}") + .as_str(), + |b| { + b.iter(|| { + black_box( + RabitQuantizationStorage::try_from_batch( + blocked_batch.clone(), + &metadata, + DistanceType::L2, + None, + ) + .unwrap(), + ) + }) + }, + ); + + let legacy_batch = make_batch(RABIT_EX_CODE_COLUMN, seq_codes, seq_code_len); + c.bench_function( + format!("RQ storage load (legacy ex codes): num_bits={NUM_BITS}, DIM={LOAD_DIM}, rows={LOAD_ROWS}") + .as_str(), + |b| { + b.iter(|| { + black_box( + RabitQuantizationStorage::try_from_batch( + legacy_batch.clone(), + &metadata, + DistanceType::L2, + None, + ) + .unwrap(), + ) + }) + }, + ); +} + +/// Bulk-scoring cost of the ex stage: the quantized ex-FastScan LUT path +/// (inside `distance_all`) vs the exact per-row ex-dot kernel. The +/// binary-only run isolates the shared binary stage so the ex cost is the +/// difference from the full run. 
+fn ex_bulk_paths(c: &mut Criterion) { + use arrow_array::{ArrayRef, FixedSizeListArray, Float32Array, UInt8Array, UInt64Array}; + use lance_arrow::FixedSizeListArrayExt; + use lance_index::vector::ApproxMode; + use lance_index::vector::bq::ex_dot::pad_query_into; + use lance_index::vector::bq::transform::{EX_ADD_FACTORS_COLUMN, EX_SCALE_FACTORS_COLUMN}; + use lance_index::vector::storage::DistanceCalculatorOptions; + use std::sync::Arc; + + const BULK_DIM: usize = 1536; + const BULK_ROWS: usize = 16384; + + let mut rng = SmallRng::seed_from_u64(13); + for num_bits in [3u8, 5, 9] { + let ex_bits = num_bits - 1; + let max_code = ((1u16 << ex_bits) - 1) as u8; + + let rq = RabitQuantizer::new_with_rotation::( + num_bits, + BULK_DIM as i32, + RQRotationType::Fast, + ); + let metadata = rq.metadata(None); + + let code_len = BULK_DIM / 8; + let binary_codes = (0..BULK_ROWS * code_len) + .map(|_| rng.random_range(0..=u8::MAX)) + .collect::>(); + let ex_code_len = blocked_ex_code_bytes(BULK_DIM, ex_bits); + let mut ex_codes = vec![0u8; BULK_ROWS * ex_code_len]; + let values = (0..BULK_DIM) + .map(|_| rng.random_range(0..=max_code)) + .collect::>(); + for row in ex_codes.chunks_exact_mut(ex_code_len) { + pack_blocked_row(&values, ex_bits, row); + } + + // No error factors: `distance_all` takes the FastScan ex bulk branch. 
+ let batch = arrow_array::RecordBatch::try_from_iter(vec![ + ( + ROW_ID, + Arc::new(UInt64Array::from_iter_values(0..BULK_ROWS as u64)) as ArrayRef, + ), + ( + RABIT_CODE_COLUMN, + Arc::new( + FixedSizeListArray::try_new_from_values( + UInt8Array::from(binary_codes), + code_len as i32, + ) + .unwrap(), + ) as ArrayRef, + ), + ( + ADD_FACTORS_COLUMN, + Arc::new(Float32Array::from(vec![0.0f32; BULK_ROWS])) as ArrayRef, + ), + ( + SCALE_FACTORS_COLUMN, + Arc::new(Float32Array::from(vec![0.0f32; BULK_ROWS])) as ArrayRef, + ), + ( + RABIT_BLOCKED_EX_CODE_COLUMN, + Arc::new( + FixedSizeListArray::try_new_from_values( + UInt8Array::from(ex_codes.clone()), + ex_code_len as i32, + ) + .unwrap(), + ) as ArrayRef, + ), + ( + EX_ADD_FACTORS_COLUMN, + Arc::new(Float32Array::from(vec![0.0f32; BULK_ROWS])) as ArrayRef, + ), + ( + EX_SCALE_FACTORS_COLUMN, + Arc::new(Float32Array::from(vec![1.0f32; BULK_ROWS])) as ArrayRef, + ), + ]) + .unwrap(); + let storage = + RabitQuantizationStorage::try_from_batch(batch, &metadata, DistanceType::L2, None) + .unwrap(); + + let query: ArrayRef = Arc::new(Float32Array::from( + (0..BULK_DIM) + .map(|_| rng.random_range(-1.0f32..1.0)) + .collect::>(), + )); + + for (label, approx_mode) in [ + ("full distance_all (binary + ex LUT)", ApproxMode::Normal), + ("binary-only distance_all (fast mode)", ApproxMode::Fast), + ] { + let mut f32_scratch = Vec::new(); + let calc = storage.dist_calculator_with_scratch( + query.clone(), + 0.0, + None, + &mut f32_scratch, + DistanceCalculatorOptions { approx_mode }, + ); + let mut dists = Vec::new(); + let mut u16_scratch = Vec::new(); + let mut u8_scratch = Vec::new(); + let mut u32_scratch = Vec::new(); + c.bench_function( + format!("RQ bulk {label}: num_bits={num_bits}, DIM={BULK_DIM}, rows={BULK_ROWS}") + .as_str(), + |b| { + b.iter(|| { + calc.distance_all_with_scratch( + 0, + &mut dists, + &mut u16_scratch, + &mut u8_scratch, + &mut u32_scratch, + ); + black_box(dists.len()) + }) + }, + ); + } + + let 
kernel = ex_dot_kernel(ex_bits); + let mut ex_query = vec![0.0f32; BULK_DIM]; + pad_query_into( + query + .as_any() + .downcast_ref::() + .unwrap() + .values(), + &mut ex_query, + ); + c.bench_function( + format!( + "RQ bulk ex kernel loop: num_bits={num_bits}, DIM={BULK_DIM}, rows={BULK_ROWS}" + ) + .as_str(), + |b| { + b.iter(|| { + let mut sum = 0.0f32; + for row in ex_codes.chunks_exact(ex_code_len) { + sum += kernel(&ex_query, row); + } + black_box(sum) + }) + }, + ); + } +} -#[cfg(not(target_os = "linux"))] criterion_group!( name=benches; config = Criterion::default().measurement_time(Duration::from_secs(10)); - targets = construct_dist_table, compute_distances); + targets = construct_dist_table, compute_distances, ex_dot_kernels, ex_code_storage_load, ex_bulk_paths); criterion_main!(benches); diff --git a/rust/lance-index/src/vector/bq.rs b/rust/lance-index/src/vector/bq.rs index 0fdd918edab..71c4eed7fd8 100644 --- a/rust/lance-index/src/vector/bq.rs +++ b/rust/lance-index/src/vector/bq.rs @@ -18,6 +18,7 @@ use crate::vector::bq::storage::RabitQuantizationMetadata; use crate::vector::quantizer::QuantizerBuildParams; pub mod builder; +pub mod ex_dot; pub mod rotation; pub mod storage; pub mod transform; diff --git a/rust/lance-index/src/vector/bq/builder.rs b/rust/lance-index/src/vector/bq/builder.rs index 178a6bb5435..9eb7fc76903 100644 --- a/rust/lance-index/src/vector/bq/builder.rs +++ b/rust/lance-index/src/vector/bq/builder.rs @@ -25,7 +25,7 @@ use crate::vector::bq::transform::{ SCALE_FACTORS_FIELD, }; use crate::vector::bq::{ - RQBuildParams, RQRotationType, rabit_binary_code_bytes, rabit_ex_bits, rabit_ex_code_bytes, + RQBuildParams, RQRotationType, rabit_binary_code_bytes, rabit_ex_bits, rotation::{apply_fast_rotation, fast_rotation_signs_len, random_fast_rotation_signs}, validate_rq_num_bits, }; @@ -78,21 +78,6 @@ fn pack_sign_bits(codes: &mut [u8], rotated: &[f32]) { } } -#[inline] -fn pack_ex_code_bits(codes: &mut [u8], ex_values: &[u8], ex_bits: 
u8) { - codes.fill(0); - let ex_bits = ex_bits as usize; - for (dim_idx, &value) in ex_values.iter().enumerate() { - let bit_offset = dim_idx * ex_bits; - for bit_idx in 0..ex_bits { - if (value >> bit_idx) & 1 != 0 { - let dst_bit = bit_offset + bit_idx; - codes[dst_bit / u8::BITS as usize] |= 1u8 << (dst_bit % u8::BITS as usize); - } - } - } -} - const EX_QUANTIZATION_EPSILON: f32 = 1.0e-5; const EX_TIGHT_START: [f32; 9] = [0.0, 0.15, 0.20, 0.52, 0.59, 0.71, 0.75, 0.77, 0.81]; @@ -200,7 +185,7 @@ fn quantize_ex_code( *ex_code_value = ex_code; } - pack_ex_code_bits(ex_code_dst, ex_code_values_dst, ex_bits); + crate::vector::bq::ex_dot::pack_blocked_row(ex_code_values_dst, ex_bits, ex_code_dst); residual_dot_code } @@ -599,7 +584,11 @@ impl RabitQuantizer { .as_slice(); let code_dim = self.code_dim(); let code_bytes = rabit_binary_code_bytes(code_dim); - let ex_code_bytes = rabit_ex_code_bytes(code_dim, ex_bits)?; + let ex_code_bytes = if ex_bits == 0 { + 0 + } else { + crate::vector::bq::ex_dot::blocked_ex_code_bytes(code_dim, ex_bits) + }; let mut encoded_codes = vec![0u8; n * code_bytes]; let mut encoded_ex_codes = (ex_bits != 0).then(|| vec![0u8; n * ex_code_bytes]); @@ -901,7 +890,7 @@ mod tests { use lance_linalg::distance::DistanceType; use rstest::rstest; - use crate::vector::bq::storage::RABIT_EX_CODE_COLUMN; + use crate::vector::bq::storage::RABIT_BLOCKED_EX_CODE_COLUMN; #[rstest] #[case(8)] @@ -978,14 +967,14 @@ mod tests { assert!( !fields .iter() - .any(|field| field.name() == RABIT_EX_CODE_COLUMN) + .any(|field| field.name() == RABIT_BLOCKED_EX_CODE_COLUMN) ); let q = RabitQuantizer::new_with_rotation::(3, 128, RQRotationType::Fast); let fields = q.extra_fields(); for expected in [ ERROR_FACTORS_FIELD.name().as_str(), - RABIT_EX_CODE_COLUMN, + RABIT_BLOCKED_EX_CODE_COLUMN, EX_ADD_FACTORS_FIELD.name().as_str(), EX_SCALE_FACTORS_FIELD.name().as_str(), ] { @@ -1095,7 +1084,8 @@ mod tests { .unwrap() .as_fixed_size_list() .value_length(), - 32 + // dim=32 
is padded to one 64-dim block at ex_bits=8. + 64 ); } diff --git a/rust/lance-index/src/vector/bq/ex_dot.rs b/rust/lance-index/src/vector/bq/ex_dot.rs new file mode 100644 index 00000000000..1aeb83ba40c --- /dev/null +++ b/rust/lance-index/src/vector/bq/ex_dot.rs @@ -0,0 +1,1078 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Inner-product kernels between an `f32` query and bit-packed RaBitQ ex-codes. +//! +//! Multi-bit RaBitQ reranking reduces to `sum_d query[d] * ex_code[d]`, where +//! `ex_code[d]` is an unsigned `ex_bits`-wide integer. Materializing a +//! `dim * 2^ex_bits` lookup table and gathering one entry per dimension is +//! cache-hostile (the table is 1MiB for `ex_bits=8`, `dim=1024`); these kernels +//! instead unpack the codes with shifts and masks and FMA them against the +//! query directly, following the kernel design of the RaBitQ reference library +//! (Apache-2.0). +//! +//! Codes are stored in the *blocked* layout: dims are grouped into 64-dim +//! blocks (the last block zero-padded) and bit-interleaved within each block +//! so that the SIMD unpack emits codes in natural dim order: +//! +//! ```text +//! per 64-dim block (T = ex_bits - 1, the top bit; "run k" = dims 16k..16k+16): +//! 1 bit: [8B] bit i of the LE word = dim i +//! 2 bits: [16B] byte b = dims {b, b+16, b+32, b+48} at bit pairs 0/2/4/6 +//! 3 bits: [16B 2-bit plane as above][8B top-bit plane] +//! 4 bits: [32B] byte 8j+b = dim 16j+b (low nibble) | dim 16j+8+b (high nibble) +//! 5 bits: [32B 4-bit plane: byte b = dims b|b+16; byte 16+b = dims b+32|b+48] +//! [8B top-bit plane] +//! 6 bits: [48B] byte 16k+b = dim 16k+b (6 low bits) | bits 2k..2k+2 of +//! dim 48+b (2 high bits) +//! 7 bits: [48B as 6 bits][8B top-bit plane] +//! 8 bits: [64B] identity +//! top-bit plane: top bit of dim 16k+b at bit 8*(b%8) + 2k + b/8 of a LE u64 +//! ``` +//! +//! Because unpack order is natural, the kernels read the rotated query +//! 
directly; it only needs zero-padding ([`pad_query_into`]) when the rotated +//! dim is not a multiple of 64. Legacy indexes store ex codes sequentially +//! (LSB-first bit stream) and are repacked once at load time +//! ([`repack_sequential_row`]); for `ex_bits` ∈ {1, 8} the two layouts agree +//! (modulo trailing padding, which the kernels tolerate) and rows are used as +//! stored. + +use std::sync::LazyLock; + +/// Dims are packed in blocks of this size; the query is zero-padded to a +/// whole number of blocks when the rotated dim is not already a multiple. +pub const EX_DOT_BLOCK_DIMS: usize = 64; + +/// `f32` length of the query consumed by the kernels. +pub fn padded_query_len(dim: usize) -> usize { + dim.next_multiple_of(EX_DOT_BLOCK_DIMS) +} + +/// Whether the legacy sequential layout of a row already matches the blocked +/// layout (modulo trailing zero padding, which the kernels tolerate), so +/// legacy rows can be consumed without repacking. +pub fn sequential_matches_blocked(ex_bits: u8) -> bool { + matches!(ex_bits, 1 | 8) +} + +/// Bytes per row of the blocked ex-code layout. +pub fn blocked_ex_code_bytes(dim: usize, ex_bits: u8) -> usize { + debug_assert!((1..=8).contains(&ex_bits)); + padded_query_len(dim) * ex_bits as usize / 8 +} + +/// Dimensions per unpacking group for the given code width. +fn group_dims(ex_bits: u8) -> usize { + match ex_bits { + 1 | 4 | 8 => 16, + _ => EX_DOT_BLOCK_DIMS, + } +} + +fn group_bytes(ex_bits: u8) -> usize { + group_dims(ex_bits) * ex_bits as usize / 8 +} + +/// Extract the `ex_bits`-wide code of `dim_idx` from a sequentially bit-packed +/// row (LSB-first, codes may straddle byte boundaries). 
+#[inline] +pub fn packed_ex_code_value(row_codes: &[u8], dim_idx: usize, ex_bits: u8) -> u8 { + debug_assert!(ex_bits > 0); + let bit_offset = dim_idx * ex_bits as usize; + let byte_idx = bit_offset / u8::BITS as usize; + let bit_shift = bit_offset % u8::BITS as usize; + let bits = row_codes[byte_idx] as u16 + | row_codes + .get(byte_idx + 1) + .map(|byte| (*byte as u16) << u8::BITS) + .unwrap_or_default(); + let mask = (1u16 << ex_bits) - 1; + ((bits >> bit_shift) & mask) as u8 +} + +/// Zero-pad the rotated query to a whole number of 64-dim blocks. Only needed +/// when `dim` is not a multiple of [`EX_DOT_BLOCK_DIMS`]; aligned queries are +/// passed to the kernels as-is. +pub fn pad_query_into(rotated_query: &[f32], out: &mut [f32]) { + debug_assert_eq!(out.len(), padded_query_len(rotated_query.len())); + out[..rotated_query.len()].copy_from_slice(rotated_query); + out[rotated_query.len()..].fill(0.0); +} + +/// Pack the top bit of each of 64 codes into a `u64` so kernels can position +/// it with two shifts per 16-code run: the top bit of dim `16k + b` is stored +/// at bit `8 * (b % 8) + 2k + b / 8`. +fn pack_top_plane(block_values: &[u8; 64], top_bit: u8) -> u64 { + let mut plane = 0u64; + for k in 0..4 { + for b in 0..16 { + let bit = (block_values[16 * k + b] >> top_bit) & 1; + plane |= (bit as u64) << (8 * (b % 8) + 2 * k + b / 8); + } + } + plane +} + +/// Shift `plane` so that its bit `8j + from_bit` lands at bit `8j + to_bit`. +#[inline(always)] +fn shift_plane(plane: u64, from_bit: usize, to_bit: usize) -> u64 { + if from_bit >= to_bit { + plane >> (from_bit - to_bit) + } else { + plane << (to_bit - from_bit) + } +} + +/// Pack one block of 64 code values (natural dim order) into the blocked +/// layout described in the module docs. 
+fn pack_block(ex_bits: u8, block_values: &[u8; 64], out: &mut [u8]) { + let v = block_values; + match ex_bits { + 1 => { + for (b, byte) in out[..8].iter_mut().enumerate() { + *byte = (0..8).fold(0, |acc, t| acc | ((v[8 * b + t] & 1) << t)); + } + } + 2 | 3 => { + for b in 0..16 { + out[b] = (v[b] & 0b11) + | ((v[16 + b] & 0b11) << 2) + | ((v[32 + b] & 0b11) << 4) + | ((v[48 + b] & 0b11) << 6); + } + if ex_bits == 3 { + out[16..24].copy_from_slice(&pack_top_plane(v, 2).to_le_bytes()); + } + } + 4 => { + for unit in 0..4 { + for b in 0..8 { + out[8 * unit + b] = + (v[16 * unit + b] & 0x0f) | ((v[16 * unit + 8 + b] & 0x0f) << 4); + } + } + } + 5 => { + for b in 0..16 { + out[b] = (v[b] & 0x0f) | ((v[16 + b] & 0x0f) << 4); + out[16 + b] = (v[32 + b] & 0x0f) | ((v[48 + b] & 0x0f) << 4); + } + out[32..40].copy_from_slice(&pack_top_plane(v, 4).to_le_bytes()); + } + 6 | 7 => { + // Runs 0, 1 and 2 keep their 6 low bits in place; the fourth run's + // low 6 bits are split into three 2-bit pieces, piece k stored in bits 6..8 of run k's bytes. + for k in 0..3 { + for b in 0..16 { + out[16 * k + b] = + (v[16 * k + b] & 0x3f) | (((v[48 + b] >> (2 * k)) & 0b11) << 6); + } + } + if ex_bits == 7 { + out[48..56].copy_from_slice(&pack_top_plane(v, 6).to_le_bytes()); + } + } + 8 => out[..64].copy_from_slice(v), + _ => unreachable!("invalid RabitQ ex_bits={ex_bits}"), + } +} + +/// Pack one row of unpacked code values (one `u8` per dim) into the blocked +/// layout; the writer path. `out` must have [`blocked_ex_code_bytes`] bytes. 
+pub fn pack_blocked_row(values: &[u8], ex_bits: u8, out: &mut [u8]) { + debug_assert_eq!(out.len(), blocked_ex_code_bytes(values.len(), ex_bits)); + let block_bytes = EX_DOT_BLOCK_DIMS * ex_bits as usize / 8; + let mut block_values = [0u8; 64]; + for (block, out) in out.chunks_exact_mut(block_bytes).enumerate() { + let base = block * EX_DOT_BLOCK_DIMS; + let count = EX_DOT_BLOCK_DIMS.min(values.len() - base); + block_values[..count].copy_from_slice(&values[base..base + count]); + block_values[count..].fill(0); + pack_block(ex_bits, &block_values, out); + } +} + +/// Repack one legacy sequentially bit-packed row into the blocked layout. +/// `out` must have [`blocked_ex_code_bytes`] bytes. +pub fn repack_sequential_row(seq_row: &[u8], dim: usize, ex_bits: u8, out: &mut [u8]) { + debug_assert_eq!(out.len(), blocked_ex_code_bytes(dim, ex_bits)); + let block_bytes = EX_DOT_BLOCK_DIMS * ex_bits as usize / 8; + let mut block_values = [0u8; 64]; + for (block, out) in out.chunks_exact_mut(block_bytes).enumerate() { + block_values.fill(0); + let base = block * EX_DOT_BLOCK_DIMS; + let count = EX_DOT_BLOCK_DIMS.min(dim.saturating_sub(base)); + for (i, value) in block_values[..count].iter_mut().enumerate() { + *value = packed_ex_code_value(seq_row, base + i, ex_bits); + } + pack_block(ex_bits, &block_values, out); + } +} + +/// Unpack one code group into per-dim values (natural dim order). Reference +/// implementation for the SIMD unpackers; also the scalar fallback. 
+fn unpack_group(ex_bits: u8, group_codes: &[u8], out: &mut [u8; 64]) { + debug_assert_eq!(group_codes.len(), group_bytes(ex_bits)); + match ex_bits { + 1 => { + for (i, value) in out[..16].iter_mut().enumerate() { + *value = (group_codes[i / 8] >> (i % 8)) & 1; + } + } + 2 => { + for k in 0..4 { + for b in 0..16 { + out[16 * k + b] = (group_codes[b] >> (2 * k)) & 0b11; + } + } + } + 3 => { + let plane = u64::from_le_bytes(group_codes[16..24].try_into().unwrap()); + for k in 0..4 { + for b in 0..16 { + let top = (plane >> (8 * (b % 8) + 2 * k + b / 8)) & 1; + out[16 * k + b] = ((group_codes[b] >> (2 * k)) & 0b11) | ((top as u8) << 2); + } + } + } + 4 => { + for b in 0..8 { + out[b] = group_codes[b] & 0x0f; + out[8 + b] = group_codes[b] >> 4; + } + } + 5 => { + let plane = u64::from_le_bytes(group_codes[32..40].try_into().unwrap()); + for k in 0..4 { + for b in 0..16 { + let nibble = (group_codes[16 * (k / 2) + b] >> (4 * (k % 2))) & 0x0f; + let top = (plane >> (8 * (b % 8) + 2 * k + b / 8)) & 1; + out[16 * k + b] = nibble | ((top as u8) << 4); + } + } + } + 6 | 7 => { + for k in 0..3 { + for b in 0..16 { + out[16 * k + b] = group_codes[16 * k + b] & 0x3f; + } + } + for b in 0..16 { + out[48 + b] = (group_codes[b] >> 6) + | ((group_codes[16 + b] >> 6) << 2) + | ((group_codes[32 + b] >> 6) << 4); + } + if ex_bits == 7 { + let plane = u64::from_le_bytes(group_codes[48..56].try_into().unwrap()); + for k in 0..4 { + for b in 0..16 { + let top = (plane >> (8 * (b % 8) + 2 * k + b / 8)) & 1; + out[16 * k + b] |= (top as u8) << 6; + } + } + } + } + 8 => out[..16].copy_from_slice(group_codes), + _ => unreachable!("invalid RabitQ ex_bits={ex_bits}"), + } +} + +/// `sum_d query[d] * code[d]` for one row of blocked-layout codes. +/// +/// The query must cover a whole number of 64-dim blocks (the rotated query +/// as-is for aligned dims, otherwise zero-padded via [`pad_query_into`]); +/// `codes` is the blocked row slice. 
Rows shorter than the padded query +/// length are treated as zero-padded. +pub type ExDotFn = fn(&[f32], &[u8]) -> f32; + +/// Resolve the dot kernel for `ex_bits` once; the result can be cached by the +/// caller for per-candidate use. +pub fn ex_dot_kernel(ex_bits: u8) -> ExDotFn { + debug_assert!((1..=8).contains(&ex_bits)); + static KERNELS: LazyLock<[ExDotFn; 8]> = + LazyLock::new(|| std::array::from_fn(|i| select_ex_dot_kernel(i as u8 + 1))); + KERNELS[usize::from(ex_bits) - 1] +} + +fn select_ex_dot_kernel(ex_bits: u8) -> ExDotFn { + #[cfg(target_arch = "x86_64")] + { + if std::arch::is_x86_feature_detected!("avx512f") { + return x86::avx512_kernel(ex_bits); + } + if std::arch::is_x86_feature_detected!("avx2") && std::arch::is_x86_feature_detected!("fma") + { + return x86::avx2_kernel(ex_bits); + } + } + #[cfg(target_arch = "aarch64")] + { + // NEON is part of the aarch64 baseline. + return neon::kernel(ex_bits); + } + #[allow(unreachable_code)] + scalar_kernel(ex_bits) +} + +fn scalar_kernel(ex_bits: u8) -> ExDotFn { + match ex_bits { + 1 => ex_dot_scalar::<1>, + 2 => ex_dot_scalar::<2>, + 3 => ex_dot_scalar::<3>, + 4 => ex_dot_scalar::<4>, + 5 => ex_dot_scalar::<5>, + 6 => ex_dot_scalar::<6>, + 7 => ex_dot_scalar::<7>, + 8 => ex_dot_scalar::<8>, + _ => unreachable!("invalid RabitQ ex_bits={ex_bits}"), + } +} + +fn ex_dot_scalar<const EX_BITS: u8>(ex_query: &[f32], codes: &[u8]) -> f32 { + let group_dims = group_dims(EX_BITS); + let bytes_per_group = group_bytes(EX_BITS); + debug_assert_eq!(ex_query.len() % EX_DOT_BLOCK_DIMS, 0); + debug_assert!(codes.len() * u8::BITS as usize <= ex_query.len() * EX_BITS as usize); + + let mut sum = 0.0f32; + let mut unpacked = [0u8; 64]; + let mut padded = [0u8; 56]; + for (group, query) in ex_query.chunks_exact(group_dims).enumerate() { + let start = group * bytes_per_group; + if start >= codes.len() { + // The remaining query lanes are zero padding. 
+ break; + } + let group_codes = if start + bytes_per_group <= codes.len() { + &codes[start..start + bytes_per_group] + } else { + let avail = codes.len() - start; + padded[..bytes_per_group].fill(0); + padded[..avail].copy_from_slice(&codes[start..]); + &padded[..bytes_per_group] + }; + unpack_group(EX_BITS, group_codes, &mut unpacked); + for (q, &code) in query.iter().zip(unpacked[..group_dims].iter()) { + sum += q * code as f32; + } + } + sum +} + +#[cfg(target_arch = "x86_64")] +mod x86 { + use super::ExDotFn; + use std::arch::x86_64::*; + + pub(super) fn avx2_kernel(ex_bits: u8) -> ExDotFn { + match ex_bits { + 1 => dot_u1_avx2_dispatch, + 2 => dot_u2_avx2_dispatch, + 3 => dot_u3_avx2_dispatch, + 4 => dot_u4_avx2_dispatch, + 5 => dot_u5_avx2_dispatch, + 6 => dot_u6_avx2_dispatch, + 7 => dot_u7_avx2_dispatch, + 8 => dot_u8_avx2_dispatch, + _ => unreachable!("invalid RabitQ ex_bits={ex_bits}"), + } + } + + pub(super) fn avx512_kernel(ex_bits: u8) -> ExDotFn { + match ex_bits { + 1 => dot_u1_avx512_dispatch, + 2 => dot_u2_avx512_dispatch, + 3 => dot_u3_avx512_dispatch, + 4 => dot_u4_avx512_dispatch, + 5 => dot_u5_avx512_dispatch, + 6 => dot_u6_avx512_dispatch, + 7 => dot_u7_avx512_dispatch, + 8 => dot_u8_avx512_dispatch, + _ => unreachable!("invalid RabitQ ex_bits={ex_bits}"), + } + } + + /// Broadcast a byte to the 8 bytes of a `u64`. + #[inline(always)] + fn splat_byte(byte: u8) -> u64 { + byte as u64 * 0x0101_0101_0101_0101 + } + + // Unpack helpers. They read exactly one group of code bytes and return + // runs of 16 codes in natural dim order, matching the query. Only SSE2 (baseline on + // x86_64) is required. + + /// 16 1-bit codes from 2 bytes: compare each replicated byte against + /// per-lane bit masks to turn set bits into 0/1 bytes. 
+ #[inline] + #[target_feature(enable = "sse2")] + unsafe fn unpack_u1(ptr: *const u8) -> [__m128i; 1] { + let (b0, b1) = unsafe { (ptr.read(), ptr.add(1).read()) }; + let bytes = _mm_set_epi64x(splat_byte(b1) as i64, splat_byte(b0) as i64); + let bit_select = _mm_set1_epi64x(0x8040_2010_0804_0201u64 as i64); + let selected = _mm_cmpeq_epi8(_mm_and_si128(bytes, bit_select), bit_select); + [_mm_and_si128(selected, _mm_set1_epi8(1))] + } + + /// 64 2-bit codes from 16 bytes: byte b holds dims b, b+16, b+32, b+48 at + /// bit pairs 0/2/4/6, so run k is bit pair k of every byte. The 16-bit shifts + /// drag bits across byte boundaries, which the per-byte mask removes. + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn unpack_u2(ptr: *const u8) -> [__m128i; 4] { + let raw = unsafe { _mm_loadu_si128(ptr as *const __m128i) }; + let mask = _mm_set1_epi8(0b11); + [ + _mm_and_si128(raw, mask), + _mm_and_si128(_mm_srli_epi16::<2>(raw), mask), + _mm_and_si128(_mm_srli_epi16::<4>(raw), mask), + _mm_and_si128(_mm_srli_epi16::<6>(raw), mask), + ] + } + + /// Position the top-bit plane (see [`super::pack_top_plane`]) of run `k` + /// at `top_bit` within each byte. + #[inline] + #[target_feature(enable = "sse2")] + fn top_plane_run(plane: u64, k: usize, top_bit: usize) -> __m128i { + let lo = super::shift_plane(plane, 2 * k, top_bit); + let hi = super::shift_plane(plane, 2 * k + 1, top_bit); + _mm_and_si128( + _mm_set_epi64x(hi as i64, lo as i64), + _mm_set1_epi8(1 << top_bit), + ) + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn unpack_u3(ptr: *const u8) -> [__m128i; 4] { + let mut runs = unsafe { unpack_u2(ptr) }; + let plane = unsafe { (ptr.add(16) as *const u64).read_unaligned() }; + for (k, run) in runs.iter_mut().enumerate() { + *run = _mm_or_si128(*run, top_plane_run(plane, k, 2)); + } + runs + } + + /// 16 4-bit codes from 8 bytes: low nibbles are dims 0..8 of the run, high + /// nibbles dims 8..16. 
+ #[inline] + #[target_feature(enable = "sse2")] + unsafe fn unpack_u4(ptr: *const u8) -> [__m128i; 1] { + let word = unsafe { (ptr as *const u64).read_unaligned() }; + let mask = 0x0f0f_0f0f_0f0f_0f0fu64; + [_mm_set_epi64x( + ((word >> 4) & mask) as i64, + (word & mask) as i64, + )] + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn unpack_u5(ptr: *const u8) -> [__m128i; 4] { + let blk0 = unsafe { _mm_loadu_si128(ptr as *const __m128i) }; + let blk1 = unsafe { _mm_loadu_si128(ptr.add(16) as *const __m128i) }; + let plane = unsafe { (ptr.add(32) as *const u64).read_unaligned() }; + let mask = _mm_set1_epi8(0x0f); + let mut runs = [ + _mm_and_si128(blk0, mask), + _mm_and_si128(_mm_srli_epi16::<4>(blk0), mask), + _mm_and_si128(blk1, mask), + _mm_and_si128(_mm_srli_epi16::<4>(blk1), mask), + ]; + for (k, run) in runs.iter_mut().enumerate() { + *run = _mm_or_si128(*run, top_plane_run(plane, k, 4)); + } + runs + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn unpack_u6(ptr: *const u8) -> [__m128i; 4] { + let blk0 = unsafe { _mm_loadu_si128(ptr as *const __m128i) }; + let blk1 = unsafe { _mm_loadu_si128(ptr.add(16) as *const __m128i) }; + let blk2 = unsafe { _mm_loadu_si128(ptr.add(32) as *const __m128i) }; + let mask6 = _mm_set1_epi8(0x3f); + let mask2 = _mm_set1_epi8(0b1100_0000u8 as i8); + let stolen = _mm_or_si128( + _mm_or_si128( + _mm_srli_epi16::<6>(_mm_and_si128(blk0, mask2)), + _mm_srli_epi16::<4>(_mm_and_si128(blk1, mask2)), + ), + _mm_srli_epi16::<2>(_mm_and_si128(blk2, mask2)), + ); + [ + _mm_and_si128(blk0, mask6), + _mm_and_si128(blk1, mask6), + _mm_and_si128(blk2, mask6), + stolen, + ] + } + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn unpack_u7(ptr: *const u8) -> [__m128i; 4] { + let mut runs = unsafe { unpack_u6(ptr) }; + let plane = unsafe { (ptr.add(48) as *const u64).read_unaligned() }; + for (k, run) in runs.iter_mut().enumerate() { + *run = _mm_or_si128(*run, top_plane_run(plane, k, 6)); + } + runs + 
} + + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn unpack_u8x16(ptr: *const u8) -> [__m128i; 1] { + [unsafe { _mm_loadu_si128(ptr as *const __m128i) }] + } + + /// FMA 16 code bytes against 16 query floats (AVX2: two 8-float halves). + #[inline] + #[target_feature(enable = "avx2", enable = "fma")] + unsafe fn fma16_avx2(codes: __m128i, query: *const f32, acc: &mut [__m256; 2]) { + let lo = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(codes)); + acc[0] = _mm256_fmadd_ps(lo, unsafe { _mm256_loadu_ps(query) }, acc[0]); + let hi = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_mm_srli_si128::<8>(codes))); + acc[1] = _mm256_fmadd_ps(hi, unsafe { _mm256_loadu_ps(query.add(8)) }, acc[1]); + } + + #[inline] + #[target_feature(enable = "avx2")] + unsafe fn reduce_add_avx2(acc: [__m256; 2]) -> f32 { + let v = _mm256_add_ps(acc[0], acc[1]); + let halves = _mm_add_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps::<1>(v)); + let pairs = _mm_add_ps(halves, _mm_movehl_ps(halves, halves)); + let total = _mm_add_ss(pairs, _mm_shuffle_ps::<0b01>(pairs, pairs)); + _mm_cvtss_f32(total) + } + + /// FMA 16 code bytes against 16 query floats (AVX-512: one 16-float lane). + #[inline] + #[target_feature(enable = "avx512f")] + unsafe fn fma16_avx512(codes: __m128i, query: *const f32, acc: &mut __m512) { + let values = _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(codes)); + *acc = _mm512_fmadd_ps(values, unsafe { _mm512_loadu_ps(query) }, *acc); + } + + macro_rules! 
x86_dot_kernel { + ($name:ident, $dispatch:ident, $unpack:ident, $ex_bits:expr, $runs:expr) => { + #[target_feature(enable = "avx2", enable = "fma")] + unsafe fn $name(ex_query: &[f32], codes: &[u8]) -> f32 { + const GROUP_DIMS: usize = if $runs == 1 { 16 } else { 64 }; + const GROUP_BYTES: usize = GROUP_DIMS * $ex_bits / 8; + debug_assert_eq!(ex_query.len() % super::EX_DOT_BLOCK_DIMS, 0); + debug_assert!(codes.len() * 8 <= ex_query.len() * $ex_bits); + + let groups = ex_query.len() / GROUP_DIMS; + let full_groups = (codes.len() / GROUP_BYTES).min(groups); + // Two accumulators per run position break the FMA latency + // chain; they are summed once at the end. + let mut acc = [_mm256_setzero_ps(); 2]; + for group in 0..full_groups { + // SAFETY: `group < full_groups` keeps both the code group + // and the query run in bounds. + let runs = unsafe { $unpack(codes.as_ptr().add(group * GROUP_BYTES)) }; + for (run, codes16) in runs.into_iter().enumerate() { + unsafe { + fma16_avx2( + codes16, + ex_query.as_ptr().add(group * GROUP_DIMS + run * 16), + &mut acc, + ) + }; + } + } + let consumed = full_groups * GROUP_BYTES; + if consumed < codes.len() && full_groups < groups { + // Zero-pad the final partial code group on the stack. + let mut padded = [0u8; GROUP_BYTES]; + padded[..codes.len() - consumed].copy_from_slice(&codes[consumed..]); + let runs = unsafe { $unpack(padded.as_ptr()) }; + for (run, codes16) in runs.into_iter().enumerate() { + unsafe { + fma16_avx2( + codes16, + ex_query.as_ptr().add(full_groups * GROUP_DIMS + run * 16), + &mut acc, + ) + }; + } + } + unsafe { reduce_add_avx2(acc) } + } + + fn $dispatch(ex_query: &[f32], codes: &[u8]) -> f32 { + // SAFETY: only selected when AVX2 and FMA were detected. + unsafe { $name(ex_query, codes) } + } + }; + } + + macro_rules! 
x86_dot_kernel_avx512 { + ($name:ident, $dispatch:ident, $unpack:ident, $ex_bits:expr, $runs:expr) => { + #[target_feature(enable = "avx512f")] + unsafe fn $name(ex_query: &[f32], codes: &[u8]) -> f32 { + const GROUP_DIMS: usize = if $runs == 1 { 16 } else { 64 }; + const GROUP_BYTES: usize = GROUP_DIMS * $ex_bits / 8; + debug_assert_eq!(ex_query.len() % super::EX_DOT_BLOCK_DIMS, 0); + debug_assert!(codes.len() * 8 <= ex_query.len() * $ex_bits); + + let groups = ex_query.len() / GROUP_DIMS; + let full_groups = (codes.len() / GROUP_BYTES).min(groups); + // Alternating by group as well as run keeps two independent + // FMA chains even for the single-run widths. + let mut acc = [_mm512_setzero_ps(); 2]; + for group in 0..full_groups { + // SAFETY: `group < full_groups` keeps both the code group + // and the query run in bounds. + let runs = unsafe { $unpack(codes.as_ptr().add(group * GROUP_BYTES)) }; + for (run, codes16) in runs.into_iter().enumerate() { + unsafe { + fma16_avx512( + codes16, + ex_query.as_ptr().add(group * GROUP_DIMS + run * 16), + &mut acc[(group + run) % 2], + ) + }; + } + } + let consumed = full_groups * GROUP_BYTES; + if consumed < codes.len() && full_groups < groups { + let mut padded = [0u8; GROUP_BYTES]; + padded[..codes.len() - consumed].copy_from_slice(&codes[consumed..]); + let runs = unsafe { $unpack(padded.as_ptr()) }; + for (run, codes16) in runs.into_iter().enumerate() { + unsafe { + fma16_avx512( + codes16, + ex_query.as_ptr().add(full_groups * GROUP_DIMS + run * 16), + &mut acc[(full_groups + run) % 2], + ) + }; + } + } + _mm512_reduce_add_ps(_mm512_add_ps(acc[0], acc[1])) + } + + fn $dispatch(ex_query: &[f32], codes: &[u8]) -> f32 { + // SAFETY: only selected when AVX-512F was detected. 
+ unsafe { $name(ex_query, codes) } + } + }; + } + + x86_dot_kernel!(dot_u1_avx2, dot_u1_avx2_dispatch, unpack_u1, 1, 1); + x86_dot_kernel!(dot_u2_avx2, dot_u2_avx2_dispatch, unpack_u2, 2, 4); + x86_dot_kernel!(dot_u3_avx2, dot_u3_avx2_dispatch, unpack_u3, 3, 4); + x86_dot_kernel!(dot_u4_avx2, dot_u4_avx2_dispatch, unpack_u4, 4, 1); + x86_dot_kernel!(dot_u5_avx2, dot_u5_avx2_dispatch, unpack_u5, 5, 4); + x86_dot_kernel!(dot_u6_avx2, dot_u6_avx2_dispatch, unpack_u6, 6, 4); + x86_dot_kernel!(dot_u7_avx2, dot_u7_avx2_dispatch, unpack_u7, 7, 4); + x86_dot_kernel!(dot_u8_avx2, dot_u8_avx2_dispatch, unpack_u8x16, 8, 1); + + x86_dot_kernel_avx512!(dot_u1_avx512, dot_u1_avx512_dispatch, unpack_u1, 1, 1); + x86_dot_kernel_avx512!(dot_u2_avx512, dot_u2_avx512_dispatch, unpack_u2, 2, 4); + x86_dot_kernel_avx512!(dot_u3_avx512, dot_u3_avx512_dispatch, unpack_u3, 3, 4); + x86_dot_kernel_avx512!(dot_u4_avx512, dot_u4_avx512_dispatch, unpack_u4, 4, 1); + x86_dot_kernel_avx512!(dot_u5_avx512, dot_u5_avx512_dispatch, unpack_u5, 5, 4); + x86_dot_kernel_avx512!(dot_u6_avx512, dot_u6_avx512_dispatch, unpack_u6, 6, 4); + x86_dot_kernel_avx512!(dot_u7_avx512, dot_u7_avx512_dispatch, unpack_u7, 7, 4); + x86_dot_kernel_avx512!(dot_u8_avx512, dot_u8_avx512_dispatch, unpack_u8x16, 8, 1); +} + +#[cfg(target_arch = "aarch64")] +mod neon { + use super::ExDotFn; + use std::arch::aarch64::*; + + pub(super) fn kernel(ex_bits: u8) -> ExDotFn { + match ex_bits { + 1 => dot_u1_neon_dispatch, + 2 => dot_u2_neon_dispatch, + 3 => dot_u3_neon_dispatch, + 4 => dot_u4_neon_dispatch, + 5 => dot_u5_neon_dispatch, + 6 => dot_u6_neon_dispatch, + 7 => dot_u7_neon_dispatch, + 8 => dot_u8_neon_dispatch, + _ => unreachable!("invalid RabitQ ex_bits={ex_bits}"), + } + } + + #[inline] + #[target_feature(enable = "neon")] + unsafe fn unpack_u1(ptr: *const u8) -> [uint8x16_t; 1] { + let (b0, b1) = unsafe { (ptr.read(), ptr.add(1).read()) }; + let bytes = vcombine_u8(vdup_n_u8(b0), vdup_n_u8(b1)); + let bit_select = 
vreinterpretq_u8_u64(vdupq_n_u64(0x8040_2010_0804_0201)); + [vandq_u8(vtstq_u8(bytes, bit_select), vdupq_n_u8(1))] + } + + #[inline] + #[target_feature(enable = "neon")] + unsafe fn unpack_u2(ptr: *const u8) -> [uint8x16_t; 4] { + let raw = unsafe { vld1q_u8(ptr) }; + let mask = vdupq_n_u8(0b11); + [ + vandq_u8(raw, mask), + vandq_u8(vshrq_n_u8::<2>(raw), mask), + vandq_u8(vshrq_n_u8::<4>(raw), mask), + vshrq_n_u8::<6>(raw), + ] + } + + #[inline] + #[target_feature(enable = "neon")] + fn top_plane_run(plane: u64, k: usize, top_bit: usize) -> uint8x16_t { + let lo = super::shift_plane(plane, 2 * k, top_bit); + let hi = super::shift_plane(plane, 2 * k + 1, top_bit); + vandq_u8( + vreinterpretq_u8_u64(vcombine_u64(vcreate_u64(lo), vcreate_u64(hi))), + vdupq_n_u8(1 << top_bit), + ) + } + + #[inline] + #[target_feature(enable = "neon")] + unsafe fn unpack_u3(ptr: *const u8) -> [uint8x16_t; 4] { + let mut runs = unsafe { unpack_u2(ptr) }; + let plane = unsafe { (ptr.add(16) as *const u64).read_unaligned() }; + for (k, run) in runs.iter_mut().enumerate() { + *run = vorrq_u8(*run, top_plane_run(plane, k, 2)); + } + runs + } + + #[inline] + #[target_feature(enable = "neon")] + unsafe fn unpack_u4(ptr: *const u8) -> [uint8x16_t; 1] { + let word = unsafe { (ptr as *const u64).read_unaligned() }; + let mask = 0x0f0f_0f0f_0f0f_0f0fu64; + [vreinterpretq_u8_u64(vcombine_u64( + vcreate_u64(word & mask), + vcreate_u64((word >> 4) & mask), + ))] + } + + #[inline] + #[target_feature(enable = "neon")] + unsafe fn unpack_u5(ptr: *const u8) -> [uint8x16_t; 4] { + let blk0 = unsafe { vld1q_u8(ptr) }; + let blk1 = unsafe { vld1q_u8(ptr.add(16)) }; + let plane = unsafe { (ptr.add(32) as *const u64).read_unaligned() }; + let mask = vdupq_n_u8(0x0f); + let mut runs = [ + vandq_u8(blk0, mask), + vshrq_n_u8::<4>(blk0), + vandq_u8(blk1, mask), + vshrq_n_u8::<4>(blk1), + ]; + for (k, run) in runs.iter_mut().enumerate() { + *run = vorrq_u8(*run, top_plane_run(plane, k, 4)); + } + runs + } + + 
#[inline] + #[target_feature(enable = "neon")] + unsafe fn unpack_u6(ptr: *const u8) -> [uint8x16_t; 4] { + let blk0 = unsafe { vld1q_u8(ptr) }; + let blk1 = unsafe { vld1q_u8(ptr.add(16)) }; + let blk2 = unsafe { vld1q_u8(ptr.add(32)) }; + let mask6 = vdupq_n_u8(0x3f); + let stolen = vorrq_u8( + vorrq_u8( + vshrq_n_u8::<6>(blk0), + vshlq_n_u8::<2>(vshrq_n_u8::<6>(blk1)), + ), + vshlq_n_u8::<4>(vshrq_n_u8::<6>(blk2)), + ); + [ + vandq_u8(blk0, mask6), + vandq_u8(blk1, mask6), + vandq_u8(blk2, mask6), + stolen, + ] + } + + #[inline] + #[target_feature(enable = "neon")] + unsafe fn unpack_u7(ptr: *const u8) -> [uint8x16_t; 4] { + let mut runs = unsafe { unpack_u6(ptr) }; + let plane = unsafe { (ptr.add(48) as *const u64).read_unaligned() }; + for (k, run) in runs.iter_mut().enumerate() { + *run = vorrq_u8(*run, top_plane_run(plane, k, 6)); + } + runs + } + + #[inline] + #[target_feature(enable = "neon")] + unsafe fn unpack_u8x16(ptr: *const u8) -> [uint8x16_t; 1] { + [unsafe { vld1q_u8(ptr) }] + } + + /// FMA 16 code bytes against 16 query floats over four 4-float lanes. + #[inline] + #[target_feature(enable = "neon")] + unsafe fn fma16_neon(codes: uint8x16_t, query: *const f32, acc: &mut [float32x4_t; 4]) { + let lo = vmovl_u8(vget_low_u8(codes)); + let hi = vmovl_u8(vget_high_u8(codes)); + let c0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(lo))); + let c1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(lo))); + let c2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(hi))); + let c3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(hi))); + unsafe { + acc[0] = vfmaq_f32(acc[0], c0, vld1q_f32(query)); + acc[1] = vfmaq_f32(acc[1], c1, vld1q_f32(query.add(4))); + acc[2] = vfmaq_f32(acc[2], c2, vld1q_f32(query.add(8))); + acc[3] = vfmaq_f32(acc[3], c3, vld1q_f32(query.add(12))); + } + } + + macro_rules! 
neon_dot_kernel { + ($name:ident, $dispatch:ident, $unpack:ident, $ex_bits:expr, $runs:expr) => { + #[target_feature(enable = "neon")] + unsafe fn $name(ex_query: &[f32], codes: &[u8]) -> f32 { + const GROUP_DIMS: usize = if $runs == 1 { 16 } else { 64 }; + const GROUP_BYTES: usize = GROUP_DIMS * $ex_bits / 8; + debug_assert_eq!(ex_query.len() % super::EX_DOT_BLOCK_DIMS, 0); + debug_assert!(codes.len() * 8 <= ex_query.len() * $ex_bits); + + let groups = ex_query.len() / GROUP_DIMS; + let full_groups = (codes.len() / GROUP_BYTES).min(groups); + let mut acc = [vdupq_n_f32(0.0); 4]; + for group in 0..full_groups { + // SAFETY: `group < full_groups` keeps both the code group + // and the query run in bounds. + let runs = unsafe { $unpack(codes.as_ptr().add(group * GROUP_BYTES)) }; + for (run, codes16) in runs.into_iter().enumerate() { + unsafe { + fma16_neon( + codes16, + ex_query.as_ptr().add(group * GROUP_DIMS + run * 16), + &mut acc, + ) + }; + } + } + let consumed = full_groups * GROUP_BYTES; + if consumed < codes.len() && full_groups < groups { + // Zero-pad the final partial code group on the stack. + let mut padded = [0u8; GROUP_BYTES]; + padded[..codes.len() - consumed].copy_from_slice(&codes[consumed..]); + let runs = unsafe { $unpack(padded.as_ptr()) }; + for (run, codes16) in runs.into_iter().enumerate() { + unsafe { + fma16_neon( + codes16, + ex_query.as_ptr().add(full_groups * GROUP_DIMS + run * 16), + &mut acc, + ) + }; + } + } + vaddvq_f32(vaddq_f32( + vaddq_f32(acc[0], acc[1]), + vaddq_f32(acc[2], acc[3]), + )) + } + + fn $dispatch(ex_query: &[f32], codes: &[u8]) -> f32 { + // SAFETY: NEON is part of the aarch64 baseline. 
+ unsafe { $name(ex_query, codes) } + } + }; + } + + neon_dot_kernel!(dot_u1_neon, dot_u1_neon_dispatch, unpack_u1, 1, 1); + neon_dot_kernel!(dot_u2_neon, dot_u2_neon_dispatch, unpack_u2, 2, 4); + neon_dot_kernel!(dot_u3_neon, dot_u3_neon_dispatch, unpack_u3, 3, 4); + neon_dot_kernel!(dot_u4_neon, dot_u4_neon_dispatch, unpack_u4, 4, 1); + neon_dot_kernel!(dot_u5_neon, dot_u5_neon_dispatch, unpack_u5, 5, 4); + neon_dot_kernel!(dot_u6_neon, dot_u6_neon_dispatch, unpack_u6, 6, 4); + neon_dot_kernel!(dot_u7_neon, dot_u7_neon_dispatch, unpack_u7, 7, 4); + neon_dot_kernel!(dot_u8_neon, dot_u8_neon_dispatch, unpack_u8x16, 8, 1); +} + +#[cfg(test)] +mod tests { + use super::*; + use rand::rngs::SmallRng; + use rand::{Rng, SeedableRng}; + use rstest::rstest; + + /// Bit-pack code values sequentially (LSB-first), the on-disk ex-code layout. + fn pack_sequential(values: &[u8], ex_bits: u8) -> Vec { + let mut out = vec![0u8; (values.len() * ex_bits as usize).div_ceil(8)]; + for (dim, &value) in values.iter().enumerate() { + let bit_offset = dim * ex_bits as usize; + let bits = (value as u16) << (bit_offset % 8); + out[bit_offset / 8] |= bits as u8; + if bits >> 8 != 0 { + out[bit_offset / 8 + 1] |= (bits >> 8) as u8; + } + } + out + } + + fn kernel_codes(values: &[u8], dim: usize, ex_bits: u8) -> Vec { + debug_assert_eq!(values.len(), dim); + let mut out = vec![0u8; blocked_ex_code_bytes(dim, ex_bits)]; + pack_blocked_row(values, ex_bits, &mut out); + out + } + + fn available_kernels(ex_bits: u8) -> Vec<(&'static str, ExDotFn)> { + // `mut` is only exercised on x86_64 where extra kernels may be pushed. 
+ #[allow(unused_mut)] + let mut kernels = vec![ + ("scalar", scalar_kernel(ex_bits)), + ("dispatched", ex_dot_kernel(ex_bits)), + ]; + #[cfg(target_arch = "x86_64")] + { + if std::arch::is_x86_feature_detected!("avx2") + && std::arch::is_x86_feature_detected!("fma") + { + kernels.push(("avx2", x86::avx2_kernel(ex_bits))); + } + if std::arch::is_x86_feature_detected!("avx512f") { + kernels.push(("avx512", x86::avx512_kernel(ex_bits))); + } + } + kernels + } + + #[rstest] + fn test_ex_dot_matches_reference( + #[values(1, 2, 3, 4, 5, 6, 7, 8)] ex_bits: u8, + #[values(7, 16, 60, 64, 100, 128, 1024, 1536, 2048)] dim: usize, + ) { + let mut rng = SmallRng::seed_from_u64(42 + ex_bits as u64 * 1000 + dim as u64); + let max_code = ((1u16 << ex_bits) - 1) as u8; + let values = (0..dim) + .map(|_| rng.random_range(0..=max_code)) + .collect::>(); + let query = (0..dim) + .map(|_| rng.random_range(-1.0f32..1.0)) + .collect::>(); + + let expected = query + .iter() + .zip(values.iter()) + .map(|(q, &c)| *q as f64 * c as f64) + .sum::(); + + let codes = kernel_codes(&values, dim, ex_bits); + let mut ex_query = vec![0.0; padded_query_len(dim)]; + pad_query_into(&query, &mut ex_query); + + let tolerance = 1e-3 * expected.abs().max(1.0); + for (name, kernel) in available_kernels(ex_bits) { + let actual = kernel(&ex_query, &codes) as f64; + assert!( + (actual - expected).abs() <= tolerance, + "ex_bits={ex_bits} dim={dim} kernel={name}: {actual} != {expected}" + ); + } + } + + #[rstest] + fn test_unpack_group_roundtrip(#[values(1, 2, 3, 4, 5, 6, 7, 8)] ex_bits: u8) { + let mut rng = SmallRng::seed_from_u64(7 + ex_bits as u64); + let max_code = ((1u16 << ex_bits) - 1) as u8; + let values = (0..EX_DOT_BLOCK_DIMS) + .map(|_| rng.random_range(0..=max_code)) + .collect::>(); + let codes = kernel_codes(&values, EX_DOT_BLOCK_DIMS, ex_bits); + + // Unpacking each kernel group must reproduce the values in natural + // dim order. 
+ let dims = group_dims(ex_bits); + let bytes = group_bytes(ex_bits); + let mut unpacked = [0u8; 64]; + for group in 0..EX_DOT_BLOCK_DIMS / dims { + unpack_group( + ex_bits, + &codes[group * bytes..(group + 1) * bytes], + &mut unpacked, + ); + assert_eq!( + &unpacked[..dims], + &values[group * dims..(group + 1) * dims], + "ex_bits={ex_bits} group={group}" + ); + } + } + + /// The legacy sequential rows must repack into exactly what the writer + /// produces from the unpacked values. + #[rstest] + fn test_repack_sequential_matches_blocked( + #[values(1, 2, 3, 4, 5, 6, 7, 8)] ex_bits: u8, + #[values(7, 64, 100, 1536)] dim: usize, + ) { + let mut rng = SmallRng::seed_from_u64(11 + ex_bits as u64 * 100 + dim as u64); + let max_code = ((1u16 << ex_bits) - 1) as u8; + let values = (0..dim) + .map(|_| rng.random_range(0..=max_code)) + .collect::>(); + let seq = pack_sequential(&values, ex_bits); + + let mut repacked = vec![0u8; blocked_ex_code_bytes(dim, ex_bits)]; + repack_sequential_row(&seq, dim, ex_bits, &mut repacked); + assert_eq!(repacked, kernel_codes(&values, dim, ex_bits)); + + // For the widths where the sequential layout is already blocked + // (modulo trailing padding), the raw row must be a prefix. + if sequential_matches_blocked(ex_bits) { + assert_eq!(&repacked[..seq.len()], &seq); + assert!(repacked[seq.len()..].iter().all(|&byte| byte == 0)); + } + } + + /// Dense dim sweep for the bit-plane widths: every tail shape within the + /// 64-dim kernel group, plus multi-group sizes. 
+ #[rstest] + fn test_ex_dot_plane_widths_dense_dims(#[values(3, 5)] ex_bits: u8) { + let mut rng = SmallRng::seed_from_u64(97 + ex_bits as u64); + let max_code = ((1u16 << ex_bits) - 1) as u8; + for dim in (1..=160).chain([255, 256, 1000, 1536, 2048]) { + let values = (0..dim) + .map(|_| rng.random_range(0..=max_code)) + .collect::>(); + let query = (0..dim) + .map(|_| rng.random_range(-1.0f32..1.0)) + .collect::>(); + let expected = query + .iter() + .zip(values.iter()) + .map(|(q, &c)| *q as f64 * c as f64) + .sum::(); + + let codes = kernel_codes(&values, dim, ex_bits); + let mut ex_query = vec![0.0; padded_query_len(dim)]; + pad_query_into(&query, &mut ex_query); + let tolerance = 1e-3 * expected.abs().max(1.0); + for (name, kernel) in available_kernels(ex_bits) { + let actual = kernel(&ex_query, &codes) as f64; + assert!( + (actual - expected).abs() <= tolerance, + "ex_bits={ex_bits} dim={dim} kernel={name}: {actual} != {expected}" + ); + } + } + } + + #[test] + fn test_pad_query_pads_with_zeros() { + let query = vec![1.0f32; 100]; + let mut padded = vec![f32::NAN; padded_query_len(query.len())]; + pad_query_into(&query, &mut padded); + assert_eq!(padded.len(), 128); + assert_eq!(&padded[..100], &query[..]); + assert!(padded[100..].iter().all(|&value| value == 0.0)); + } +} diff --git a/rust/lance-index/src/vector/bq/storage.rs b/rust/lance-index/src/vector/bq/storage.rs index bd70f176c5d..36e56986921 100644 --- a/rust/lance-index/src/vector/bq/storage.rs +++ b/rust/lance-index/src/vector/bq/storage.rs @@ -41,6 +41,10 @@ use serde::{Deserialize, Serialize}; use crate::frag_reuse::FragReuseIndex; use crate::pb; use crate::vector::ApproxMode; +use crate::vector::bq::ex_dot::{ + EX_DOT_BLOCK_DIMS, ExDotFn, blocked_ex_code_bytes, ex_dot_kernel, pad_query_into, + padded_query_len, repack_sequential_row, sequential_matches_blocked, +}; use crate::vector::bq::rotation::{apply_fast_rotation, apply_fast_rotation_in_place}; use crate::vector::bq::transform::{ 
ADD_FACTORS_COLUMN, ERROR_FACTORS_COLUMN, EX_ADD_FACTORS_COLUMN, EX_SCALE_FACTORS_COLUMN, @@ -59,7 +63,14 @@ use crate::vector::storage::{ pub const RABIT_METADATA_KEY: &str = "lance:rabit"; pub const RABIT_CODE_COLUMN: &str = "_rabit_codes"; +/// Legacy ex-code column: sequential LSB-first bit stream per row. Read-only; +/// rows are repacked into the blocked layout at load time. pub const RABIT_EX_CODE_COLUMN: &str = "__ex_codes"; +/// Ex-code column in the blocked layout consumed by the ex-dot kernels (see +/// `ex_dot` module docs). Indexes written with this column cannot be read by +/// older versions, which fail with a missing-column error instead of +/// misinterpreting the bytes. +pub const RABIT_BLOCKED_EX_CODE_COLUMN: &str = "__blocked_ex_codes"; pub const SEGMENT_LENGTH: usize = 4; pub const SEGMENT_NUM_CODES: usize = 1 << SEGMENT_LENGTH; const RABIT_PRUNE_STATS_ENV: &str = "LANCE_RQ_PRUNE_STATS"; @@ -210,10 +221,10 @@ pub fn rabit_ex_code_field(rotated_dim: usize, num_bits: u8) -> Result(&rotated_query, &mut dist_table); - let mut ex_dist_table = vec![0.0; ex_dist_table_len]; - build_ex_dist_table_direct_into(&rotated_query, ex_bits, &mut ex_dist_table); + // The kernels consume the rotated query directly; a zero-padded copy + // is only needed when the rotated dim is not block-aligned. 
+ let mut ex_query = Vec::new(); + if ex_bits > 0 && !code_dim.is_multiple_of(EX_DOT_BLOCK_DIMS) { + ex_query.resize(padded_query_len(code_dim), 0.0); + pad_query_into(&rotated_query, &mut ex_query); + } let sum_q = rotated_query.iter().copied().sum(); Ok(RabitRawQueryContext { @@ -370,7 +381,7 @@ impl RabitQuantizationMetadata { ex_bits, rotated_query, dist_table, - ex_dist_table, + ex_query, sum_q, }) } @@ -462,6 +473,10 @@ pub struct RabitQuantizationStorage { add_factors: Float32Array, scale_factors: Float32Array, error_factors: Option, + // ex codes in the blocked kernel layout; always aliases the batch column + // (legacy sequential batches are normalized at load, replacing the + // sequential column with the repacked one, so rewrites emit the blocked + // format). ex_codes: Option, packed_ex_codes: Option, ex_add_factors: Option, @@ -560,12 +575,17 @@ impl RabitQuantizationStorage { let RabitDistCalculatorParts { dim, dist_table, - ex_dist_table, + ex_query, sum_q, query_factor, query_error, approx_mode, } = parts; + let ex_code_len = self + .ex_codes + .as_ref() + .map(|codes| codes.value_length() as usize) + .unwrap_or_default(); let ex_codes = self .ex_codes .as_ref() @@ -579,10 +599,11 @@ impl RabitQuantizationStorage { self.metadata.num_bits, self.metadata.query_estimator, dist_table, - ex_dist_table, + ex_query, sum_q, self.codes.values().as_primitive::().values(), ex_codes, + ex_code_len, self.add_factors.values(), self.scale_factors.values(), self.error_factors @@ -767,25 +788,42 @@ fn copy_subtract_f32(lhs: &[f32], rhs: &[f32], output: &mut [f32]) { struct RabitDistCalculatorParts<'a> { dim: usize, dist_table: Cow<'a, [f32]>, - ex_dist_table: Cow<'a, [f32]>, + ex_query: Cow<'a, [f32]>, sum_q: f32, query_factor: f32, query_error: f32, approx_mode: ApproxMode, } +/// Pick the query slice the ex-dot kernels consume: the rotated query itself +/// when the dim is block-aligned, otherwise a zero-padded copy. 
+fn kernel_query<'a>(rotated_query: &'a [f32], padded: &'a [f32]) -> &'a [f32] { + if rotated_query.len().is_multiple_of(EX_DOT_BLOCK_DIMS) { + rotated_query + } else { + padded + } +} + pub struct RabitDistCalculator<'a> { dim: usize, num_bits: u8, query_estimator: RabitQueryEstimator, // n * d / 8 binary-code bytes codes: &'a [u8], + // per-row ex codes in the blocked kernel layout ex_codes: Option<&'a [u8]>, + // bytes per ex-code row; legacy rows for layout-compatible widths may be + // shorter than the blocked size, which the kernels treat as zero padding + ex_code_len: usize, // this is a flattened 2D array of size d/4 * 16, // we split the query codes into d/4 chunks, each chunk is with 4 elements, // then dist_table[i][j] is the distance between the i-th query code and the code j dist_table: Cow<'a, [f32]>, - ex_dist_table: Cow<'a, [f32]>, + // the rotated query, zero-padded to a 64-dim multiple when needed; also + // the source for the FastScan ex LUT on the legacy bypass path + ex_query: Cow<'a, [f32]>, + ex_dot: Option, add_factors: &'a [f32], scale_factors: &'a [f32], error_factors: Option<&'a [f32]>, @@ -807,10 +845,11 @@ impl<'a> RabitDistCalculator<'a> { num_bits: u8, query_estimator: RabitQueryEstimator, dist_table: Cow<'a, [f32]>, - ex_dist_table: Cow<'a, [f32]>, + ex_query: Cow<'a, [f32]>, sum_q: f32, codes: &'a [u8], ex_codes: Option<&'a [u8]>, + ex_code_len: usize, add_factors: &'a [f32], scale_factors: &'a [f32], error_factors: Option<&'a [f32]>, @@ -821,14 +860,17 @@ impl<'a> RabitDistCalculator<'a> { query_error: f32, approx_mode: ApproxMode, ) -> Self { + let ex_dot = (num_bits > 1).then(|| ex_dot_kernel(num_bits - 1)); Self { dim, num_bits, query_estimator, codes, ex_codes, + ex_code_len, dist_table, - ex_dist_table, + ex_query, + ex_dot, add_factors, scale_factors, error_factors, @@ -843,6 +885,18 @@ impl<'a> RabitDistCalculator<'a> { } } + /// `sum_d query[d] * ex_code[d]` for the candidate's packed ex codes. 
+ #[inline] + fn ex_code_dot(&self, ex_codes: &[u8], id: usize) -> f32 { + let ex_dot = self + .ex_dot + .expect("raw-query multi-bit RQ requires an ex-dot kernel"); + ex_dot( + self.ex_query.as_ref(), + &ex_codes[id * self.ex_code_len..(id + 1) * self.ex_code_len], + ) + } + #[allow(clippy::uninit_vec)] fn binary_distances_with_scratch( &self, @@ -1030,8 +1084,6 @@ impl<'a> RabitDistCalculator<'a> { let ex_scale_factors = self .ex_scale_factors .expect("raw-query multi-bit RQ requires ex scale factors"); - let ex_code_len = - rabit_ex_code_bytes(self.dim, ex_bits).expect("RabitQ num_bits should be validated"); let code_scale = (1u32 << ex_bits) as f32; let code_bias = -(code_scale - 0.5); @@ -1039,12 +1091,11 @@ impl<'a> RabitDistCalculator<'a> { self.packed_ex_codes .map(|packed_ex_codes| { let fastscan_len = simd_len; - let fastscan_code_len = ex_fastscan_code_len(self.dim, ex_bits) - .expect("RabitQ num_bits should be validated"); + let fastscan_code_len = self.ex_code_len; let (qmin, qmax, quantization_max) = quantize_ex_fastscan_dist_table_into( - self.dim, ex_bits, - &self.ex_dist_table, + self.ex_code_len, + self.ex_query.as_ref(), quantized_dists_table, ); quantized_dists.clear(); @@ -1088,14 +1139,7 @@ impl<'a> RabitDistCalculator<'a> { .enumerate() .skip(fastscan_len) .for_each(|(id, dist)| { - let ex_dist = compute_single_rq_ex_distance( - ex_codes, - id, - ex_code_len, - ex_bits, - self.dim, - &self.ex_dist_table, - ); + let ex_dist = self.ex_code_dot(ex_codes, id); let full_dot = code_scale * *dist + ex_dist + code_bias * self.sum_q; *dist = full_dot * ex_scale_factors[id] + ex_add_factors[id] + self.query_factor; }); @@ -1121,19 +1165,11 @@ impl<'a> RabitDistCalculator<'a> { id: usize, binary_ip: f32, ex_bits: u8, - ex_code_len: usize, ex_codes: &[u8], ex_add_factors: &[f32], ex_scale_factors: &[f32], ) -> f32 { - let ex_dist = compute_single_rq_ex_distance( - ex_codes, - id, - ex_code_len, - ex_bits, - self.dim, - &self.ex_dist_table, - ); + let 
ex_dist = self.ex_code_dot(ex_codes, id); let code_bias = -((1u32 << ex_bits) as f32 - 0.5); let full_dot = (1u32 << ex_bits) as f32 * binary_ip + ex_dist + code_bias * self.sum_q; full_dot * ex_scale_factors[id] + ex_add_factors[id] + self.query_factor @@ -1180,8 +1216,6 @@ impl<'a> RabitDistCalculator<'a> { let ex_scale_factors = self .ex_scale_factors .expect("raw-query multi-bit RQ requires ex scale factors"); - let ex_code_len = - rabit_ex_code_bytes(self.dim, ex_bits).expect("RabitQ num_bits should be validated"); let query_lower_bound = lower_bound.unwrap_or(f32::MIN); let query_upper_bound = upper_bound.unwrap_or(f32::MAX); let mut max_dist = res.peek().map(|node| node.dist); @@ -1213,7 +1247,6 @@ impl<'a> RabitDistCalculator<'a> { id, binary_ip, ex_bits, - ex_code_len, ex_codes, ex_add_factors, ex_scale_factors, @@ -1276,33 +1309,6 @@ where dist_table } -fn build_ex_dist_table_direct(rotated_query: &[f32], ex_bits: u8) -> Vec { - if ex_bits == 0 { - return Vec::new(); - } - let entries_per_dim = 1usize << ex_bits; - let mut dist_table = vec![0.0; rotated_query.len() * entries_per_dim]; - build_ex_dist_table_direct_into(rotated_query, ex_bits, &mut dist_table); - dist_table -} - -fn build_ex_dist_table_direct_into(rotated_query: &[f32], ex_bits: u8, dist_table: &mut [f32]) { - if ex_bits == 0 { - debug_assert!(dist_table.is_empty()); - return; - } - let entries_per_dim = 1usize << ex_bits; - debug_assert_eq!(dist_table.len(), rotated_query.len() * entries_per_dim); - for (query_value, table) in rotated_query - .iter() - .zip(dist_table.chunks_exact_mut(entries_per_dim)) - { - for (code, value) in table.iter_mut().enumerate() { - *value = *query_value * code as f32; - } - } -} - fn build_dist_table_direct_into(qc: &[T::Native], dist_table: &mut [f32]) where T::Native: AsPrimitive, @@ -1401,33 +1407,20 @@ fn quantize_dist_table_u16_into( (qmin, qmax) } -#[inline] -fn packed_ex_code_value(row_codes: &[u8], dim_idx: usize, ex_bits: u8) -> u8 { - 
debug_assert!(ex_bits > 0); - let bit_offset = dim_idx * ex_bits as usize; - let byte_idx = bit_offset / u8::BITS as usize; - let bit_shift = bit_offset % u8::BITS as usize; - let bits = row_codes[byte_idx] as u16 - | row_codes - .get(byte_idx + 1) - .map(|byte| (*byte as u16) << u8::BITS) - .unwrap_or_default(); - let mask = (1u16 << ex_bits) - 1; - ((bits >> bit_shift) & mask) as u8 -} - +/// Build the u8 FastScan LUT for the ex codes directly from the rotated +/// query (`ex_query`, natural dim order, padding dims zero): the underlying +/// per-dim table is the pure multiplication `q[d] * code`, so no intermediate +/// `dim * 2^ex_bits` table is materialized. fn quantize_ex_fastscan_dist_table_into( - dim: usize, ex_bits: u8, - ex_dist_table: &[f32], + ex_code_len: usize, + ex_query: &[f32], quantized_dist_table: &mut Vec, ) -> (f32, f32, f32) { debug_assert!(supports_ex_fastscan(ex_bits)); - let entries_per_dim = 1usize << ex_bits; - debug_assert_eq!(ex_dist_table.len(), dim * entries_per_dim); - let num_split_tables = - ex_fastscan_code_len(dim, ex_bits).expect("RabitQ num_bits should be validated") * 2; + // One split table per code nibble of the row. 
+ let num_split_tables = ex_code_len * 2; let quantization_max = (u16::MAX as usize / num_split_tables) .min(u8::MAX as usize) .max(1) as f32; @@ -1436,7 +1429,7 @@ fn quantize_ex_fastscan_dist_table_into( let mut qmax = f32::NEG_INFINITY; for table_idx in 0..num_split_tables { for code in 0..SEGMENT_NUM_CODES { - let value = ex_fastscan_dist_table_value(dim, ex_bits, ex_dist_table, table_idx, code); + let value = ex_fastscan_dist_table_value(ex_query, ex_bits, table_idx, code); qmin = qmin.min(value); qmax = qmax.max(value); } @@ -1452,7 +1445,7 @@ fn quantize_ex_fastscan_dist_table_into( let factor = quantization_max / (qmax - qmin); for table_idx in 0..num_split_tables { for code in 0..SEGMENT_NUM_CODES { - let value = ex_fastscan_dist_table_value(dim, ex_bits, ex_dist_table, table_idx, code); + let value = ex_fastscan_dist_table_value(ex_query, ex_bits, table_idx, code); quantized_dist_table.push(((value - qmin) * factor).round() as u8); } } @@ -1465,91 +1458,153 @@ fn supports_ex_fastscan(ex_bits: u8) -> bool { matches!(ex_bits, 2 | 4 | 8) } -#[inline] -fn ex_fastscan_code_len(dim: usize, ex_bits: u8) -> Option { - match ex_bits { - 2 | 4 | 8 => rabit_ex_code_bytes(dim, ex_bits).ok(), - _ => None, - } -} - +/// The FastScan LUT value for one nibble of a blocked-layout code byte: +/// `table_idx / 2` is the byte position within a row and `table_idx % 2` +/// selects its low/high nibble (see the `ex_dot` module docs for the +/// byte-to-dim mapping per width). Dims beyond the query length (block +/// padding) contribute zero. 
#[inline] fn ex_fastscan_dist_table_value( - dim: usize, + ex_query: &[f32], ex_bits: u8, - ex_dist_table: &[f32], table_idx: usize, code: usize, ) -> f32 { + let query = |dim_idx: usize| ex_query.get(dim_idx).copied().unwrap_or(0.0); + let byte_idx = table_idx / 2; + let high_nibble = table_idx % 2 == 1; match ex_bits { 2 => { - let dim_idx = table_idx * 2; - let low = code & 0b11; - let high = (code >> 2) & 0b11; - ex_dist_table_value(ex_dist_table, dim, ex_bits, dim_idx, low) - + ex_dist_table_value(ex_dist_table, dim, ex_bits, dim_idx + 1, high) + // byte 16g+b = dims {64g+b, +16, +32, +48} at bit pairs; the low + // nibble covers the first two dims, the high nibble the last two. + let dim_idx = 64 * (byte_idx / 16) + byte_idx % 16 + 32 * usize::from(high_nibble); + let low = (code & 0b11) as f32; + let high = ((code >> 2) & 0b11) as f32; + query(dim_idx) * low + query(dim_idx + 16) * high + } + 4 => { + // byte 32g+8j+b = dim 64g+16j+b (low nibble) | dim +8 (high). + let in_block = byte_idx % 32; + let dim_idx = 64 * (byte_idx / 32) + + 16 * (in_block / 8) + + in_block % 8 + + 8 * usize::from(high_nibble); + query(dim_idx) * code as f32 } - 4 => ex_dist_table_value(ex_dist_table, dim, ex_bits, table_idx, code), 8 => { - let dim_idx = table_idx / 2; - if table_idx.is_multiple_of(2) { - ex_dist_table_value(ex_dist_table, dim, ex_bits, dim_idx, code) + // byte = dim identity; the high nibble carries code bits 4..8. 
+ let code = if high_nibble { + code << SEGMENT_LENGTH } else { - ex_dist_table_value(ex_dist_table, dim, ex_bits, dim_idx, code << SEGMENT_LENGTH) - } + code + }; + query(byte_idx) * code as f32 } _ => unreachable!("unsupported RabitQ ex_bits={ex_bits} for FastScan"), } } -#[inline] -fn ex_dist_table_value( - ex_dist_table: &[f32], - dim: usize, - ex_bits: u8, - dim_idx: usize, - code: usize, -) -> f32 { - if dim_idx >= dim { - return 0.0; - } - let entries_per_dim = 1usize << ex_bits; - ex_dist_table[dim_idx * entries_per_dim + code] -} - -#[inline] -fn compute_single_rq_ex_distance( - ex_codes: &[u8], - id: usize, - ex_code_len: usize, - ex_bits: u8, - dim: usize, - ex_dist_table: &[f32], -) -> f32 { - if ex_bits == 0 { - return 0.0; - } - let entries_per_dim = 1usize << ex_bits; - let row_codes = &ex_codes[id * ex_code_len..(id + 1) * ex_code_len]; - (0..dim) - .map(|dim_idx| { - let code = packed_ex_code_value(row_codes, dim_idx, ex_bits) as usize; - ex_dist_table[dim_idx * entries_per_dim + code] - }) - .sum() -} - +/// Transpose ex codes for the FastScan bulk path. That path is only reachable +/// when lower-bound gating is disabled, i.e. for legacy indexes without error +/// factors; gated indexes rerank per candidate with the ex-dot kernels and +/// never touch this copy, so skip the transpose (and its resident memory). fn maybe_pack_ex_codes( ex_codes: Option<&FixedSizeListArray>, ex_bits: u8, + error_factors: Option<&Float32Array>, ) -> Option { let ex_codes = ex_codes?; + if error_factors.is_some() { + return None; + } match ex_bits { 2 | 4 | 8 => Some(pack_codes(ex_codes)), _ => None, } } +/// Bring legacy sequential ex codes into the blocked kernel layout: rows are +/// repacked, except for the widths whose layouts agree byte-for-byte (then +/// the column is used as stored). 
+fn blocked_ex_codes_from_sequential( + seq_codes: &FixedSizeListArray, + dim: usize, + ex_bits: u8, +) -> Result { + if sequential_matches_blocked(ex_bits) + && seq_codes.value_length() as usize == blocked_ex_code_bytes(dim, ex_bits) + { + return Ok(seq_codes.clone()); + } + let seq_code_len = seq_codes.value_length() as usize; + let seq_values = seq_codes.values().as_primitive::().values(); + let blocked_code_len = blocked_ex_code_bytes(dim, ex_bits); + let mut blocked_values = vec![0u8; seq_codes.len() * blocked_code_len]; + for (seq_row, blocked_row) in seq_values + .chunks_exact(seq_code_len) + .zip(blocked_values.chunks_exact_mut(blocked_code_len)) + { + repack_sequential_row(seq_row, dim, ex_bits, blocked_row); + } + Ok(FixedSizeListArray::try_new_from_values( + UInt8Array::from(blocked_values), + blocked_code_len as i32, + )?) +} + +/// Load the ex-code column of an index batch into the blocked kernel layout, +/// accepting both the blocked format and the legacy sequential format. Legacy +/// batches are normalized in place (the sequential column is replaced by the +/// blocked one), so rewrites — remap, optimize merges — always emit the +/// blocked format and legacy indexes upgrade on their next rewrite. 
+pub(crate) fn load_blocked_ex_codes( + batch: RecordBatch, + rotated_dim: usize, + num_bits: u8, +) -> Result<(RecordBatch, FixedSizeListArray)> { + let ex_bits = rabit_ex_bits(num_bits)?; + if let Some(column) = batch.column_by_name(RABIT_BLOCKED_EX_CODE_COLUMN) { + let codes = column.as_fixed_size_list().clone(); + let expected_bytes = blocked_ex_code_bytes(rotated_dim, ex_bits); + if codes.value_length() as usize != expected_bytes { + return Err(Error::invalid_input(format!( + "RabitQ ex-code byte width mismatch: column {} has {} bytes, metadata rotated_dim={} ex_bits={} requires {} bytes", + RABIT_BLOCKED_EX_CODE_COLUMN, + codes.value_length(), + rotated_dim, + ex_bits, + expected_bytes + ))); + } + return Ok((batch, codes)); + } + let column = batch.column_by_name(RABIT_EX_CODE_COLUMN).ok_or_else(|| { + Error::invalid_input(format!( + "RabitQ num_bits={} requires {} column", + num_bits, RABIT_BLOCKED_EX_CODE_COLUMN + )) + })?; + let codes = column.as_fixed_size_list().clone(); + let expected_bytes = rabit_ex_code_bytes(rotated_dim, ex_bits)?; + if codes.value_length() as usize != expected_bytes { + return Err(Error::invalid_input(format!( + "RabitQ ex-code byte width mismatch: column {} has {} bytes, metadata rotated_dim={} ex_bits={} requires {} bytes", + RABIT_EX_CODE_COLUMN, + codes.value_length(), + rotated_dim, + ex_bits, + expected_bytes + ))); + } + let blocked = blocked_ex_codes_from_sequential(&codes, rotated_dim, ex_bits)?; + let ex_code_field = rabit_ex_code_field(rotated_dim, num_bits)? + .expect("multi-bit RabitQ always has an ex-code field"); + let batch = batch + .drop_column(RABIT_EX_CODE_COLUMN)? 
+ .try_with_column(ex_code_field, Arc::new(blocked.clone()))?; + Ok((batch, blocked)) +} + impl DistCalculator for RabitDistCalculator<'_> { #[inline(always)] fn distance(&self, id: u32) -> f32 { @@ -1580,13 +1635,10 @@ impl DistCalculator for RabitDistCalculator<'_> { let ex_scale_factors = self .ex_scale_factors .expect("raw-query multi-bit RQ requires ex scale factors"); - let ex_code_len = rabit_ex_code_bytes(self.dim, ex_bits) - .expect("RabitQ num_bits should be validated"); self.raw_query_multi_bit_exact_distance( id, dist, ex_bits, - ex_code_len, ex_codes, ex_add_factors, ex_scale_factors, @@ -1865,8 +1917,6 @@ impl VectorStore for RabitQuantizationStorage { let code_dim = self.code_dim(); let rotated_qr = self.rotate_query_vector(code_dim, &qr); let dist_table = build_dist_table_direct::(&rotated_qr); - let ex_bits = self.metadata.num_bits - 1; - let ex_dist_table = build_ex_dist_table_direct(&rotated_qr, ex_bits); let query_factor = match self.metadata.query_estimator { RabitQueryEstimator::ResidualQuery => self.residual_query_factor(dist_q_c), RabitQueryEstimator::RawQuery => self.raw_query_factor(dist_q_c, &rotated_qr, None), @@ -1877,12 +1927,21 @@ impl VectorStore for RabitQuantizationStorage { self.raw_query_error_for_gating(dist_q_c, &rotated_qr, None) } }; - let sum_q = rotated_qr.into_iter().sum(); + let sum_q = rotated_qr.iter().copied().sum(); + // The kernels read the rotated query directly; only unaligned dims + // need a zero-padded copy. 
+ let ex_query = if code_dim.is_multiple_of(EX_DOT_BLOCK_DIMS) { + rotated_qr + } else { + let mut padded = vec![0.0; padded_query_len(code_dim)]; + pad_query_into(&rotated_qr, &mut padded); + padded + }; self.distance_calculator_from_parts(RabitDistCalculatorParts { dim: code_dim, dist_table: Cow::Owned(dist_table), - ex_dist_table: Cow::Owned(ex_dist_table), + ex_query: Cow::Owned(ex_query), sum_q, query_factor, query_error, @@ -1921,7 +1980,10 @@ impl VectorStore for RabitQuantizationStorage { return self.distance_calculator_from_parts(RabitDistCalculatorParts { dim: code_dim, dist_table: Cow::Borrowed(&raw_query.dist_table), - ex_dist_table: Cow::Borrowed(&raw_query.ex_dist_table), + ex_query: Cow::Borrowed(kernel_query( + &raw_query.rotated_query, + &raw_query.ex_query, + )), sum_q: raw_query.sum_q, query_factor, query_error, @@ -1931,18 +1993,20 @@ impl VectorStore for RabitQuantizationStorage { let dist_table_len = code_dim * 4; let ex_bits = self.metadata.num_bits - 1; - let ex_dist_table_len = if ex_bits == 0 { + // The kernels read the rotated query in place; a zero-padded copy is + // only needed when the rotated dim is not block-aligned. 
+ let ex_query_table_len = if ex_bits == 0 || code_dim.is_multiple_of(EX_DOT_BLOCK_DIMS) { 0 } else { - code_dim * (1usize << ex_bits) + padded_query_len(code_dim) }; - f32_scratch.resize(code_dim + dist_table_len + ex_dist_table_len, 0.0); + f32_scratch.resize(code_dim + dist_table_len + ex_query_table_len, 0.0); let query_factor; let query_error; let sum_q = { let (rotated_qr, remaining) = f32_scratch.split_at_mut(code_dim); - let (dist_table, ex_dist_table) = remaining.split_at_mut(dist_table_len); + let (dist_table, ex_query) = remaining.split_at_mut(dist_table_len); match residual { Some(QueryResidual::Centroid(residual_centroid)) => { self.rotate_query_vector_into( @@ -1981,17 +2045,20 @@ impl VectorStore for RabitQuantizationStorage { } }; build_dist_table_direct_into::(rotated_qr, dist_table); - build_ex_dist_table_direct_into(rotated_qr, ex_bits, ex_dist_table); + if ex_query_table_len > 0 { + pad_query_into(rotated_qr, ex_query); + } rotated_qr.iter().copied().sum() }; + let ex_query_start = code_dim + dist_table_len; self.distance_calculator_from_parts(RabitDistCalculatorParts { dim: code_dim, - dist_table: Cow::Borrowed(&f32_scratch[code_dim..code_dim + dist_table_len]), - ex_dist_table: Cow::Borrowed( - &f32_scratch - [code_dim + dist_table_len..code_dim + dist_table_len + ex_dist_table_len], - ), + dist_table: Cow::Borrowed(&f32_scratch[code_dim..ex_query_start]), + ex_query: Cow::Borrowed(kernel_query( + &f32_scratch[..code_dim], + &f32_scratch[ex_query_start..ex_query_start + ex_query_table_len], + )), sum_q, query_factor, query_error, @@ -2192,31 +2259,14 @@ impl QuantizerStorage for RabitQuantizationStorage { .column_by_name(ERROR_FACTORS_COLUMN) .map(|factors| factors.as_primitive::().clone()); let ex_bits = rabit_ex_bits(metadata.num_bits)?; + let mut batch = batch; let mut ex_codes = None; let mut ex_add_factors = None; let mut ex_scale_factors = None; if ex_bits != 0 { - let codes = batch - .column_by_name(RABIT_EX_CODE_COLUMN) - 
.ok_or_else(|| { - Error::invalid_input(format!( - "RabitQ num_bits={} requires {} column", - metadata.num_bits, RABIT_EX_CODE_COLUMN - )) - })? - .as_fixed_size_list() - .clone(); - let expected_ex_code_bytes = rabit_ex_code_bytes(metadata.rotated_dim(), ex_bits)?; - if codes.value_length() as usize != expected_ex_code_bytes { - return Err(Error::invalid_input(format!( - "RabitQ ex-code byte width mismatch: column {} has {} bytes, metadata rotated_dim={} ex_bits={} requires {} bytes", - RABIT_EX_CODE_COLUMN, - codes.value_length(), - metadata.rotated_dim(), - ex_bits, - expected_ex_code_bytes - ))); - } + let (normalized_batch, codes) = + load_blocked_ex_codes(batch, metadata.rotated_dim(), metadata.num_bits)?; + batch = normalized_batch; ex_codes = Some(codes); ex_add_factors = Some( batch @@ -2246,16 +2296,19 @@ impl QuantizerStorage for RabitQuantizationStorage { if batch.column_by_name(EX_ADD_FACTORS_COLUMN).is_some() || batch.column_by_name(EX_SCALE_FACTORS_COLUMN).is_some() || batch.column_by_name(RABIT_EX_CODE_COLUMN).is_some() + || batch.column_by_name(RABIT_BLOCKED_EX_CODE_COLUMN).is_some() { return Err(Error::invalid_input( "RabitQ num_bits=1 raw-query indexes must not contain ex-code columns" .to_string(), )); } - } else if batch.column_by_name(RABIT_EX_CODE_COLUMN).is_some() { + } else if batch.column_by_name(RABIT_EX_CODE_COLUMN).is_some() + || batch.column_by_name(RABIT_BLOCKED_EX_CODE_COLUMN).is_some() + { return Err(Error::invalid_input(format!( - "RabitQ num_bits={} does not support {} column", - metadata.num_bits, RABIT_EX_CODE_COLUMN + "RabitQ num_bits={} does not support ex-code columns", + metadata.num_bits ))); } @@ -2270,7 +2323,8 @@ impl QuantizerStorage for RabitQuantizationStorage { let mut metadata = metadata.clone(); metadata.packed = true; - let packed_ex_codes = maybe_pack_ex_codes(ex_codes.as_ref(), ex_bits); + let packed_ex_codes = + maybe_pack_ex_codes(ex_codes.as_ref(), ex_bits, error_factors.as_ref()); Ok(Self { metadata, @@ 
-2353,11 +2407,18 @@ impl QuantizerStorage for RabitQuantizationStorage { let error_factors = batch .column_by_name(ERROR_FACTORS_COLUMN) .map(|factors| factors.as_primitive::().clone()); - let ex_codes = batch - .column_by_name(RABIT_EX_CODE_COLUMN) - .map(|codes| codes.as_fixed_size_list().clone()); + let ex_bits = rabit_ex_bits(self.metadata.num_bits)?; + let (batch, ex_codes) = if ex_bits == 0 { + (batch, None) + } else { + // `self.batch` is already normalized at load, so this is a + // zero-copy column lookup. + let (batch, codes) = + load_blocked_ex_codes(batch, self.metadata.rotated_dim(), self.metadata.num_bits)?; + (batch, Some(codes)) + }; let packed_ex_codes = - maybe_pack_ex_codes(ex_codes.as_ref(), rabit_ex_bits(self.metadata.num_bits)?); + maybe_pack_ex_codes(ex_codes.as_ref(), ex_bits, error_factors.as_ref()); let ex_add_factors = batch .column_by_name(EX_ADD_FACTORS_COLUMN) .map(|factors| factors.as_primitive::().clone()); @@ -2695,7 +2756,7 @@ mod tests { assert!(rabit_ex_code_field(128, 1).unwrap().is_none()); let ex_field = rabit_ex_code_field(128, 9).unwrap().unwrap(); - assert_eq!(ex_field.name(), RABIT_EX_CODE_COLUMN); + assert_eq!(ex_field.name(), RABIT_BLOCKED_EX_CODE_COLUMN); let DataType::FixedSizeList(_, ex_code_bytes) = ex_field.data_type() else { panic!("ex-code field should be FixedSizeList"); }; @@ -2898,6 +2959,229 @@ mod tests { assert_eq!(distances, vec![104.0, 22.0]); } + /// Exercise the ex-dot kernel through the storage API for every ex width, + /// including the widths without FastScan support ({1, 3, 5, 6, 7}), and a + /// dim that is not a multiple of the 64-dim kernel group. + /// + /// The dim must be a multiple of 8: the binary distance stage consumes + /// two 4-dim segments per code byte and ignores trailing dims otherwise. 
+ #[test] + fn test_raw_query_multi_bit_distance_matches_reference_for_all_ex_widths() { + use rand::rngs::SmallRng; + use rand::{Rng, SeedableRng}; + + // 72 exercises the kernels' padded-tail path; 1536 is a production + // embedding dim exercising the full-group path. Both the blocked + // format and the legacy sequential format must produce the same + // distances. + for (code_dim, num_rows) in [(72usize, 33usize), (1536, 33)] { + for num_bits in 2..=9u8 { + for legacy_format in [false, true] { + let ex_bits = num_bits - 1; + let mut rng = SmallRng::seed_from_u64(num_bits as u64); + + let sign_bits = (0..num_rows * code_dim) + .map(|_| rng.random_bool(0.5)) + .collect::>(); + let max_code = ((1u16 << ex_bits) - 1) as u8; + let ex_values = (0..num_rows * code_dim) + .map(|_| rng.random_range(0..=max_code)) + .collect::>(); + + let code_len = rabit_binary_code_bytes(code_dim); + let mut code_bytes = vec![0u8; num_rows * code_len]; + for (row, bits) in sign_bits.chunks_exact(code_dim).enumerate() { + for (dim, &bit) in bits.iter().enumerate() { + code_bytes[row * code_len + dim / 8] |= (bit as u8) << (dim % 8); + } + } + let (ex_code_column, ex_code_len, ex_code_bytes) = if legacy_format { + let ex_code_len = rabit_ex_code_bytes(code_dim, ex_bits).unwrap(); + let mut ex_code_bytes = vec![0u8; num_rows * ex_code_len]; + for (row, values) in ex_values.chunks_exact(code_dim).enumerate() { + for (dim, &value) in values.iter().enumerate() { + let bit_offset = dim * ex_bits as usize; + let bits = (value as u16) << (bit_offset % 8); + ex_code_bytes[row * ex_code_len + bit_offset / 8] |= bits as u8; + if bits >> 8 != 0 { + ex_code_bytes[row * ex_code_len + bit_offset / 8 + 1] |= + (bits >> 8) as u8; + } + } + } + (RABIT_EX_CODE_COLUMN, ex_code_len, ex_code_bytes) + } else { + let ex_code_len = blocked_ex_code_bytes(code_dim, ex_bits); + let mut ex_code_bytes = vec![0u8; num_rows * ex_code_len]; + for (row, values) in ex_code_bytes + .chunks_exact_mut(ex_code_len) + 
.zip(ex_values.chunks_exact(code_dim)) + { + crate::vector::bq::ex_dot::pack_blocked_row(values, ex_bits, row); + } + (RABIT_BLOCKED_EX_CODE_COLUMN, ex_code_len, ex_code_bytes) + }; + + let identity = Float32Array::from_iter_values((0..code_dim).flat_map(|row| { + (0..code_dim).map(move |col| if row == col { 1.0 } else { 0.0 }) + })); + let rotate_mat = + FixedSizeListArray::try_new_from_values(identity, code_dim as i32).unwrap(); + let metadata = RabitQuantizationMetadata { + rotate_mat: Some(rotate_mat), + rotate_mat_position: None, + fast_rotation_signs: None, + rotation_type: RQRotationType::Matrix, + code_dim: code_dim as u32, + num_bits, + packed: false, + query_estimator: RabitQueryEstimator::RawQuery, + }; + let codes = FixedSizeListArray::try_new_from_values( + UInt8Array::from(code_bytes), + code_len as i32, + ) + .unwrap(); + let ex_codes = FixedSizeListArray::try_new_from_values( + UInt8Array::from(ex_code_bytes), + ex_code_len as i32, + ) + .unwrap(); + let ex_add_factors = (0..num_rows) + .map(|_| rng.random_range(-1.0f32..1.0)) + .collect::>(); + let ex_scale_factors = (0..num_rows) + .map(|_| rng.random_range(0.1f32..1.0)) + .collect::>(); + let batch = RecordBatch::try_from_iter(vec![ + ( + ROW_ID, + Arc::new(UInt64Array::from_iter_values(0..num_rows as u64)) as ArrayRef, + ), + (RABIT_CODE_COLUMN, Arc::new(codes) as ArrayRef), + ( + ADD_FACTORS_COLUMN, + Arc::new(Float32Array::from(vec![0.0; num_rows])) as ArrayRef, + ), + ( + SCALE_FACTORS_COLUMN, + Arc::new(Float32Array::from(vec![0.0; num_rows])) as ArrayRef, + ), + (ex_code_column, Arc::new(ex_codes) as ArrayRef), + ( + EX_ADD_FACTORS_COLUMN, + Arc::new(Float32Array::from(ex_add_factors.clone())) as ArrayRef, + ), + ( + EX_SCALE_FACTORS_COLUMN, + Arc::new(Float32Array::from(ex_scale_factors.clone())) as ArrayRef, + ), + ]) + .unwrap(); + let storage = RabitQuantizationStorage::try_from_batch( + batch, + &metadata, + DistanceType::L2, + None, + ) + .unwrap(); + + let query = (0..code_dim) + 
.map(|_| rng.random_range(-1.0f32..1.0)) + .collect::>(); + let sum_q = query.iter().sum::(); + let calc = storage.dist_calculator( + Arc::new(Float32Array::from(query.clone())) as ArrayRef, + 0.0, + ); + + let code_scale = (1u32 << ex_bits) as f32; + let code_bias = -(code_scale - 0.5); + let expected = (0..num_rows) + .map(|row| { + let binary_ip = (0..code_dim) + .map(|dim| { + query[dim] * sign_bits[row * code_dim + dim] as u8 as f32 + }) + .sum::(); + let ex_dist = (0..code_dim) + .map(|dim| query[dim] * ex_values[row * code_dim + dim] as f32) + .sum::(); + let full_dot = code_scale * binary_ip + ex_dist + code_bias * sum_q; + full_dot * ex_scale_factors[row] + ex_add_factors[row] + }) + .collect::>(); + + for (row, &want) in expected.iter().enumerate() { + let got = calc.distance(row as u32); + assert!( + (got - want).abs() <= 1e-3 * want.abs().max(1.0), + "num_bits={num_bits} row={row}: {got} != {want}" + ); + } + + let mut distances = Vec::new(); + let mut u16_scratch = Vec::new(); + let mut u8_scratch = Vec::new(); + let mut u32_scratch = Vec::new(); + calc.distance_all_with_scratch( + 0, + &mut distances, + &mut u16_scratch, + &mut u8_scratch, + &mut u32_scratch, + ); + assert_eq!(distances.len(), num_rows); + // The bulk path quantizes the binary LUT to u8, and that error is + // amplified by 2^ex_bits in the multi-bit estimate, so the value + // assertions need a quantization-aware bound. The FastScan ex + // widths additionally quantize the ex LUT and are covered by + // `test_raw_query_multi_bit_distance_all_uses_fastscan_for_split_ex_codes`. + if !matches!(ex_bits, 2 | 4 | 8) { + // Worst-case |error| of one u8-quantized binary LUT lookup is + // (table range) / 255 / 2, accumulated over one lookup per + // 8-dim pair of segments. 
+ let num_tables = code_dim.div_ceil(4); + let mut table_min = f32::INFINITY; + let mut table_max = f32::NEG_INFINITY; + for segment in query.chunks(4) { + for subset in 0..16usize { + let value = segment + .iter() + .enumerate() + .filter(|(idx, _)| subset & (1 << idx) != 0) + .map(|(_, q)| *q) + .sum::(); + table_min = table_min.min(value); + table_max = table_max.max(value); + } + } + let binary_bound = + code_scale * num_tables as f32 * (table_max - table_min) / 255.0 / 2.0 + * ex_scale_factors.iter().fold(0.0f32, |max, &s| max.max(s)); + for (row, (&got, &want)) in + distances.iter().zip(expected.iter()).enumerate() + { + assert!( + (got - want).abs() <= binary_bound + 1e-3, + "num_bits={num_bits} row={row} (distance_all): {got} != {want} (bound {binary_bound})" + ); + } + // Rows past the SIMD batch use the exact binary path, so the + // final remainder row must match the per-candidate distance. + let remainder_row = num_rows - 1; + let got = distances[remainder_row]; + let want = calc.distance(remainder_row as u32); + assert!( + (got - want).abs() <= 1e-3 * want.abs().max(1.0), + "num_bits={num_bits} remainder row (distance_all): {got} != {want}" + ); + } + } + } + } + } + #[test] fn test_fast_approx_mode_uses_one_bit_scores_for_multi_bit_raw_query() { let code_dim = 8usize; @@ -3061,10 +3345,17 @@ mod tests { assert_eq!(hacc_accum_len, num_rows); } - fn assert_raw_query_multi_bit_distance_all_uses_fastscan(num_bits: u8) { - let code_dim = 8usize; + fn assert_raw_query_multi_bit_distance_all_uses_fastscan( + num_bits: u8, + legacy_format: bool, + with_error_factors: bool, + ) { + // Not a multiple of 64, so the padded-tail LUT entries are exercised; + // a multiple of 8 as the binary stage requires. 
+ let code_dim = 72usize; let num_rows = BATCH_SIZE + 1; let ex_bits = rabit_ex_bits(num_bits).unwrap(); + let max_code = ((1u16 << ex_bits) - 1) as u8; let identity = Float32Array::from_iter_values( (0..code_dim) .flat_map(|row| (0..code_dim).map(move |col| if row == col { 1.0 } else { 0.0 })), @@ -3081,16 +3372,42 @@ mod tests { packed: false, query_estimator: RabitQueryEstimator::RawQuery, }; + let code_len = rabit_binary_code_bytes(code_dim); let codes = FixedSizeListArray::try_new_from_values( - UInt8Array::from_iter_values((0..num_rows).map(|idx| (idx * 13) as u8)), - 1, + UInt8Array::from_iter_values((0..num_rows * code_len).map(|idx| (idx * 13) as u8)), + code_len as i32, ) .unwrap(); - let ex_code_len = rabit_ex_code_bytes(code_dim, ex_bits).unwrap(); + let ex_values = (0..num_rows * code_dim) + .map(|idx| ((idx * 37) % (max_code as usize + 1)) as u8) + .collect::>(); + let (ex_code_column, ex_code_len, ex_code_bytes) = if legacy_format { + let ex_code_len = rabit_ex_code_bytes(code_dim, ex_bits).unwrap(); + let mut ex_code_bytes = vec![0u8; num_rows * ex_code_len]; + for (row, values) in ex_values.chunks_exact(code_dim).enumerate() { + for (dim, &value) in values.iter().enumerate() { + let bit_offset = dim * ex_bits as usize; + let bits = (value as u16) << (bit_offset % 8); + ex_code_bytes[row * ex_code_len + bit_offset / 8] |= bits as u8; + if bits >> 8 != 0 { + ex_code_bytes[row * ex_code_len + bit_offset / 8 + 1] |= (bits >> 8) as u8; + } + } + } + (RABIT_EX_CODE_COLUMN, ex_code_len, ex_code_bytes) + } else { + let ex_code_len = blocked_ex_code_bytes(code_dim, ex_bits); + let mut ex_code_bytes = vec![0u8; num_rows * ex_code_len]; + for (row, values) in ex_code_bytes + .chunks_exact_mut(ex_code_len) + .zip(ex_values.chunks_exact(code_dim)) + { + crate::vector::bq::ex_dot::pack_blocked_row(values, ex_bits, row); + } + (RABIT_BLOCKED_EX_CODE_COLUMN, ex_code_len, ex_code_bytes) + }; let ex_codes = FixedSizeListArray::try_new_from_values( - 
UInt8Array::from_iter_values( - (0..num_rows * ex_code_len).map(|idx| (idx * 37 % 251) as u8), - ), + UInt8Array::from(ex_code_bytes), ex_code_len as i32, ) .unwrap(); @@ -3108,7 +3425,7 @@ mod tests { SCALE_FACTORS_COLUMN, Arc::new(Float32Array::from(vec![1.0; num_rows])) as ArrayRef, ), - (RABIT_EX_CODE_COLUMN, Arc::new(ex_codes) as ArrayRef), + (ex_code_column, Arc::new(ex_codes) as ArrayRef), ( EX_ADD_FACTORS_COLUMN, Arc::new(Float32Array::from(vec![0.0; num_rows])) as ArrayRef, @@ -3119,12 +3436,30 @@ mod tests { ), ]) .unwrap(); + let batch = if with_error_factors { + batch + .try_with_column( + crate::vector::bq::transform::ERROR_FACTORS_FIELD.clone(), + Arc::new(Float32Array::from(vec![1000.0; num_rows])) as ArrayRef, + ) + .unwrap() + } else { + batch + }; let storage = RabitQuantizationStorage::try_from_batch(batch, &metadata, DistanceType::L2, None) .unwrap(); - assert!(storage.packed_ex_codes.is_some()); + // The FastScan transpose only exists for indexes that can reach the + // bulk bypass path (no error factors); gated indexes fall through to + // the exact per-row kernels in `distance_all`. + assert_eq!(storage.packed_ex_codes.is_some(), !with_error_factors); - let query = Arc::new(Float32Array::from(vec![1.0; code_dim])) as ArrayRef; + // A per-dim varying query so that any dim-mapping error in the + // FastScan LUT shows up as a value mismatch. 
+ let query_values = (0..code_dim) + .map(|dim| (dim % 11) as f32 * 0.3 - 1.5) + .collect::>(); + let query = Arc::new(Float32Array::from(query_values.clone())) as ArrayRef; let calc = storage.dist_calculator(query, 0.0); let mut distances = Vec::new(); let mut u16_scratch = Vec::new(); @@ -3140,15 +3475,57 @@ mod tests { assert_eq!(distances.len(), num_rows); assert_eq!(u16_scratch.len(), BATCH_SIZE); - assert_eq!( - u8_scratch.len(), - ex_fastscan_code_len(code_dim, ex_bits).unwrap() * 2 * SEGMENT_NUM_CODES + let loaded_ex_code_len = storage.ex_codes.as_ref().unwrap().value_length() as usize; + if with_error_factors { + // The gated path never builds the ex LUT; the scratch holds the + // binary LUT only. + assert_eq!(u8_scratch.len(), code_dim * 4); + } else { + assert_eq!(u8_scratch.len(), loaded_ex_code_len * 2 * SEGMENT_NUM_CODES); + } + + // The fastscan estimate differs from the exact path only by the u8 + // quantization of the binary LUT (amplified by 2^ex_bits) and of the + // ex LUT, so bound the comparison by those quantization errors. 
+ let mut table_min = f32::INFINITY; + let mut table_max = f32::NEG_INFINITY; + for segment in query_values.chunks(4) { + for subset in 0..SEGMENT_NUM_CODES { + let value = segment + .iter() + .enumerate() + .filter(|(idx, _)| subset & (1 << idx) != 0) + .map(|(_, q)| *q) + .sum::(); + table_min = table_min.min(value); + table_max = table_max.max(value); + } + } + let code_scale = (1u32 << ex_bits) as f32; + let binary_bound = + code_scale * code_dim.div_ceil(4) as f32 * (table_max - table_min) / 510.0; + let mut padded_query = vec![0.0f32; crate::vector::bq::ex_dot::padded_query_len(code_dim)]; + crate::vector::bq::ex_dot::pad_query_into(&query_values, &mut padded_query); + let mut quantized_table = Vec::new(); + let (ex_qmin, ex_qmax, ex_qcap) = quantize_ex_fastscan_dist_table_into( + ex_bits, + loaded_ex_code_len, + &padded_query, + &mut quantized_table, ); + // Without the FastScan transpose the ex stage is exact, so only the + // binary LUT quantization remains. + let ex_bound = if with_error_factors { + 0.0 + } else { + (loaded_ex_code_len * 2) as f32 * (ex_qmax - ex_qmin) / ex_qcap / 2.0 + }; + let bound = (binary_bound + ex_bound) * 1.5 + 1e-3; for (id, distance) in distances.iter().take(BATCH_SIZE).enumerate() { let exact = calc.distance(id as u32); assert!( - (*distance - exact).abs() < 10.0, - "distance_all fastscan mismatch for id {id}: actual={distance}, exact={exact}" + (*distance - exact).abs() <= bound, + "distance_all fastscan mismatch for id {id} (num_bits={num_bits} legacy={legacy_format}): actual={distance}, exact={exact}, bound={bound}" ); } assert_eq!(distances[BATCH_SIZE], calc.distance(BATCH_SIZE as u32)); @@ -3156,8 +3533,17 @@ mod tests { #[test] fn test_raw_query_multi_bit_distance_all_uses_fastscan_for_split_ex_codes() { - for num_bits in [3, 9] { - assert_raw_query_multi_bit_distance_all_uses_fastscan(num_bits); + for num_bits in [3, 5, 9] { + for legacy_format in [false, true] { + assert_raw_query_multi_bit_distance_all_uses_fastscan( 
+ num_bits, + legacy_format, + false, + ); + } + // Gated indexes (with error factors) skip the FastScan artifacts + // and score the bulk path with the exact kernels. + assert_raw_query_multi_bit_distance_all_uses_fastscan(num_bits, false, true); } } @@ -3239,7 +3625,6 @@ mod tests { id, binary_ip, ex_bits, - ex_code_len, ex_codes, ex_add_factors, ex_scale_factors, @@ -3457,7 +3842,8 @@ mod tests { ) .unwrap_err(); assert!( - err.to_string().contains("requires __ex_codes column"), + err.to_string() + .contains("requires __blocked_ex_codes column"), "{}", err ); @@ -3501,9 +3887,11 @@ mod tests { .unwrap(); assert!(storage.metadata().packed); + // Legacy batches are normalized to the blocked column at load. let stored_batch = storage.to_batches().unwrap().next().unwrap(); + assert!(stored_batch.column_by_name(RABIT_EX_CODE_COLUMN).is_none()); assert_eq!( - stored_batch[RABIT_EX_CODE_COLUMN] + stored_batch[RABIT_BLOCKED_EX_CODE_COLUMN] .as_fixed_size_list() .value_length(), 64 @@ -3571,11 +3959,19 @@ mod tests { #[test] fn test_remap_preserves_multi_bit_rq_split_columns() { + // num_bits=9 keeps sequential ex codes; num_bits 4/6/8 (ex_bits + // 3/5/7) also exercise the bit-plane repack rebuild in `remap`. 
+ for num_bits in [4, 6, 8, 9u8] { + test_remap_preserves_multi_bit_rq_split_columns_impl(num_bits); + } + } + + fn test_remap_preserves_multi_bit_rq_split_columns_impl(num_bits: u8) { let original_codes = make_test_codes(50, 64); let code_dim = original_codes.value_length() as usize * 8; - let ex_codes = make_test_ex_codes(original_codes.len(), code_dim, 9); + let ex_codes = make_test_ex_codes(original_codes.len(), code_dim, num_bits); let mut metadata = make_test_metadata(code_dim); - metadata.num_bits = 9; + metadata.num_bits = num_bits; let storage = RabitQuantizationStorage::try_from_batch( make_test_batch_with_ex(original_codes.clone(), ex_codes), &metadata, @@ -3599,11 +3995,14 @@ mod tests { ); assert_eq!(remapped_row_ids, expected_row_ids.values()); + // Legacy batches are normalized to the blocked format at load, so the + // remapped batch carries the blocked column. + let ex_code_len = blocked_ex_code_bytes(code_dim, rabit_ex_bits(num_bits).unwrap()); assert_eq!( - remapped_batch[RABIT_EX_CODE_COLUMN] + remapped_batch[RABIT_BLOCKED_EX_CODE_COLUMN] .as_fixed_size_list() .value_length(), - 64 + ex_code_len as i32 ); assert_eq!( &remapped_batch[EX_ADD_FACTORS_COLUMN] @@ -3623,5 +4022,20 @@ mod tests { .values()[..5], &[0.25, 1.25, 2.25, 4.25, 5.25] ); + + // The remapped storage must hold the same kernel-layout ex codes as a + // storage freshly loaded from the remapped batch. 
+ let reloaded = RabitQuantizationStorage::try_from_batch( + remapped_batch, + &remapped.metadata, + DistanceType::L2, + None, + ) + .unwrap(); + assert_eq!(remapped.ex_codes, reloaded.ex_codes); + assert_eq!( + remapped.ex_codes.as_ref().unwrap().value_length() as usize, + blocked_ex_code_bytes(code_dim, rabit_ex_bits(num_bits).unwrap()) + ); } } diff --git a/rust/lance-index/src/vector/bq/transform.rs b/rust/lance-index/src/vector/bq/transform.rs index c2fc0608102..c87695e14cd 100644 --- a/rust/lance-index/src/vector/bq/transform.rs +++ b/rust/lance-index/src/vector/bq/transform.rs @@ -17,7 +17,9 @@ use tracing::instrument; use crate::vector::bq::builder::RabitQuantizer; use crate::vector::bq::rabit_ex_bits; -use crate::vector::bq::storage::{RABIT_CODE_COLUMN, RABIT_EX_CODE_COLUMN, RabitQueryEstimator}; +use crate::vector::bq::storage::{ + RABIT_BLOCKED_EX_CODE_COLUMN, RABIT_CODE_COLUMN, RabitQueryEstimator, +}; use crate::vector::quantizer::Quantization; use crate::vector::transform::Transformer; use crate::vector::{CENTROID_DIST_COLUMN, PART_ID_COLUMN}; @@ -281,7 +283,7 @@ impl Transformer for RQTransformer { #[instrument(name = "RQTransformer::transform", level = "debug", skip_all)] fn transform(&self, batch: &RecordBatch) -> Result { let has_split_codes = self.rq.num_bits() == 1 - || (batch.column_by_name(RABIT_EX_CODE_COLUMN).is_some() + || (batch.column_by_name(RABIT_BLOCKED_EX_CODE_COLUMN).is_some() && batch.column_by_name(EX_ADD_FACTORS_COLUMN).is_some() && batch.column_by_name(EX_SCALE_FACTORS_COLUMN).is_some()); if batch.column_by_name(RABIT_CODE_COLUMN).is_some() && has_split_codes { @@ -494,7 +496,8 @@ mod tests { use crate::vector::bq::RQRotationType; use crate::vector::bq::builder::RabitQuantizer; - use crate::vector::bq::storage::RABIT_EX_CODE_COLUMN; + use crate::vector::bq::ex_dot::blocked_ex_code_bytes; + use crate::vector::bq::storage::RABIT_BLOCKED_EX_CODE_COLUMN; use crate::vector::transform::Transformer; use 
crate::vector::{CENTROID_DIST_COLUMN, PART_ID_COLUMN}; @@ -535,15 +538,19 @@ mod tests { .unwrap(); let transformed = transformer.transform(&batch).unwrap(); - assert!(transformed.column_by_name(RABIT_EX_CODE_COLUMN).is_some()); + assert!( + transformed + .column_by_name(RABIT_BLOCKED_EX_CODE_COLUMN) + .is_some() + ); assert_eq!( - transformed[RABIT_EX_CODE_COLUMN] + transformed[RABIT_BLOCKED_EX_CODE_COLUMN] .as_fixed_size_list() .value_length(), - 3 + blocked_ex_code_bytes(8, 3) as i32 ); assert!( - transformed[RABIT_EX_CODE_COLUMN] + transformed[RABIT_BLOCKED_EX_CODE_COLUMN] .as_fixed_size_list() .values() .as_primitive::() diff --git a/rust/lance-index/src/vector/distributed/index_merger.rs b/rust/lance-index/src/vector/distributed/index_merger.rs index 5f59985673e..70371ad4794 100755 --- a/rust/lance-index/src/vector/distributed/index_merger.rs +++ b/rust/lance-index/src/vector/distributed/index_merger.rs @@ -1440,6 +1440,25 @@ pub async fn merge_partial_vector_auxiliary_files( ))); } + // Shards written by older lance versions carry sequential ex + // codes; normalize every batch to the blocked layout before + // concatenation so mixed-version shards merge correctly + // (concat_batches combines columns by position and would + // otherwise mix the two layouts silently). 
+ let batches = match rq_meta.as_ref() { + Some(meta) if meta.num_bits > 1 => batches + .into_iter() + .map(|batch| { + crate::vector::bq::storage::load_blocked_ex_codes( + batch, + meta.rotated_dim(), + meta.num_bits, + ) + .map(|(batch, _)| batch) + }) + .collect::>>()?, + _ => batches, + }; let schema = batches[0].schema(); let partition_batch = concat_batches(&schema, batches.iter())?; if let Some(w) = v2w_opt.as_mut() { @@ -1527,7 +1546,7 @@ mod tests { use prost::Message; use crate::vector::bq::RQRotationType; - use crate::vector::bq::storage::{RABIT_EX_CODE_COLUMN, RabitQueryEstimator}; + use crate::vector::bq::storage::{RABIT_BLOCKED_EX_CODE_COLUMN, RabitQueryEstimator}; use crate::vector::bq::transform::{EX_ADD_FACTORS_COLUMN, EX_SCALE_FACTORS_COLUMN}; lance_testing::define_stage_event_progress!( RecordingProgress, @@ -2529,11 +2548,14 @@ mod tests { let batch = batch.unwrap(); if !checked_split_columns { let schema = batch.schema(); - let ex_code_field = schema.field_with_name(RABIT_EX_CODE_COLUMN).unwrap(); + let ex_code_field = schema + .field_with_name(RABIT_BLOCKED_EX_CODE_COLUMN) + .unwrap(); let DataType::FixedSizeList(_, ex_code_bytes) = ex_code_field.data_type() else { panic!("RQ ex-code field should be FixedSizeList"); }; - assert_eq!(*ex_code_bytes, 6); + // code_dim=16 padded to one 64-dim block at ex_bits=3. 
+ assert_eq!(*ex_code_bytes, 24); assert!(schema.field_with_name(ERROR_FACTORS_FIELD.name()).is_ok()); assert!(schema.field_with_name(EX_ADD_FACTORS_COLUMN).is_ok()); assert!(schema.field_with_name(EX_SCALE_FACTORS_COLUMN).is_ok()); diff --git a/rust/lance-index/src/vector/storage.rs b/rust/lance-index/src/vector/storage.rs index 8c091402687..a14308197ed 100644 --- a/rust/lance-index/src/vector/storage.rs +++ b/rust/lance-index/src/vector/storage.rs @@ -251,7 +251,10 @@ pub struct RabitRawQueryContext { pub ex_bits: u8, pub rotated_query: Vec, pub dist_table: Vec, - pub ex_dist_table: Vec, + /// The rotated query zero-padded to a 64-dim multiple for the ex-dot + /// kernels; empty when `code_dim` is already aligned (the kernels then + /// read `rotated_query` directly). + pub ex_query: Vec, pub sum_q: f32, } diff --git a/rust/lance/src/index/vector/ivf/v2.rs b/rust/lance/src/index/vector/ivf/v2.rs index 29d9e224970..202a4423d49 100644 --- a/rust/lance/src/index/vector/ivf/v2.rs +++ b/rust/lance/src/index/vector/ivf/v2.rs @@ -39,8 +39,9 @@ use lance_index::frag_reuse::FragReuseIndex; use lance_index::metrics::{LocalMetricsCollector, MetricsCollector, NoOpMetricsCollector}; use lance_index::vector::VectorIndexCacheEntry; use lance_index::vector::bq::builder::RabitQuantizer; +use lance_index::vector::bq::ex_dot::{blocked_ex_code_bytes, padded_query_len}; +use lance_index::vector::bq::rabit_ex_bits; use lance_index::vector::bq::storage::{RabitQueryEstimator, SEGMENT_NUM_CODES}; -use lance_index::vector::bq::{rabit_ex_bits, rabit_ex_code_bytes}; use lance_index::vector::flat::index::{FlatBinQuantizer, FlatIndex, FlatQuantizer}; use lance_index::vector::graph::OrderedNode; use lance_index::vector::hnsw::HNSW; @@ -153,16 +154,18 @@ fn rotated_partition_centroid_slice( cache.rotated_centroids.get(start..end) } -fn rabit_ex_dist_table_len(dim: usize, num_bits: u8) -> usize { - rabit_ex_bits(num_bits) - .map(|ex_bits| { - if ex_bits == 0 { - 0 - } else { - dim * (1usize << 
usize::from(ex_bits)) - } - }) - .unwrap_or(dim * 256) +/// `f32` scratch needed for the ex-bit query state: a zero-padded query copy +/// when the rotated dim is not a multiple of the 64-dim kernel block (the +/// FastScan ex LUT is built directly from the query, with no f32 table). +fn rabit_ex_scratch_len(dim: usize, num_bits: u8) -> usize { + let multi_bit = rabit_ex_bits(num_bits) + .map(|ex_bits| ex_bits > 0) + .unwrap_or(true); + if !multi_bit || dim.is_multiple_of(64) { + 0 + } else { + padded_query_len(dim) + } } fn rabit_u8_scratch_len(dim: usize, num_bits: u8) -> usize { @@ -170,7 +173,7 @@ fn rabit_u8_scratch_len(dim: usize, num_bits: u8) -> usize { let ex_dist_table_len = rabit_ex_bits(num_bits) .ok() .and_then(|ex_bits| match ex_bits { - 2 | 4 | 8 => rabit_ex_code_bytes(dim, ex_bits).ok(), + 2 | 4 | 8 => Some(blocked_ex_code_bytes(dim, ex_bits)), _ => None, }) .map(|ex_code_len| ex_code_len * 2 * SEGMENT_NUM_CODES) @@ -184,12 +187,12 @@ fn rabit_query_scratch_capacity( num_bits: u8, ) -> QueryScratchCapacity { let dist_table_len = dim * 4; - let ex_dist_table_len = rabit_ex_dist_table_len(dim, num_bits); + let ex_scratch_len = rabit_ex_scratch_len(dim, num_bits); let u8_scratch_len = rabit_u8_scratch_len(dim, num_bits); QueryScratchCapacity::new( max_partition_len, - dim + dist_table_len + ex_dist_table_len, + dim + dist_table_len + ex_scratch_len, max_partition_len.max(dist_table_len), u8_scratch_len, ) @@ -1950,7 +1953,8 @@ mod tests { use lance_arrow::FixedSizeListArrayExt; use lance_index::vector::bq::{ RQBuildParams, RQRotationType, - storage::{RABIT_EX_CODE_COLUMN, RabitQuantizationMetadata, RabitQueryEstimator}, + ex_dot::{blocked_ex_code_bytes, padded_query_len}, + storage::{RABIT_BLOCKED_EX_CODE_COLUMN, RabitQuantizationMetadata, RabitQueryEstimator}, transform::{EX_ADD_FACTORS_COLUMN, EX_SCALE_FACTORS_COLUMN}, }; use lance_index::vector::storage::VectorStore; @@ -2025,14 +2029,17 @@ mod tests { } #[test] - fn 
test_rabit_ex_dist_table_len_uses_num_bits() { + fn test_rabit_ex_scratch_len_uses_num_bits() { + // Block-aligned dims read the rotated query in place. let dim = 960; + for num_bits in [1, 3, 5, 7, 9] { + assert_eq!(super::rabit_ex_scratch_len(dim, num_bits), 0); + } - assert_eq!(super::rabit_ex_dist_table_len(dim, 1), 0); - assert_eq!(super::rabit_ex_dist_table_len(dim, 3), dim * 4); - assert_eq!(super::rabit_ex_dist_table_len(dim, 5), dim * 16); - assert_eq!(super::rabit_ex_dist_table_len(dim, 7), dim * 64); - assert_eq!(super::rabit_ex_dist_table_len(dim, 9), dim * 256); + // Unaligned multi-bit queries add one padded query copy. + let dim = 968; + assert_eq!(super::rabit_ex_scratch_len(dim, 1), 0); + assert_eq!(super::rabit_ex_scratch_len(dim, 7), padded_query_len(dim)); } #[test] @@ -2054,7 +2061,7 @@ mod tests { let capacity = super::rabit_query_scratch_capacity(dim, max_partition_len, 5); assert_eq!(capacity.distances, max_partition_len); - assert_eq!(capacity.query_f32, dim + dim * 4 + dim * 16); + assert_eq!(capacity.query_f32, dim + dim * 4); assert_eq!(capacity.u16, max_partition_len); assert_eq!(capacity.u8, dim * 16); assert_eq!(capacity.u32, 0); @@ -4445,18 +4452,24 @@ mod tests { } #[rstest] - #[case::l2(DistanceType::L2)] - #[case::cosine(DistanceType::Cosine)] + #[case::l2(DistanceType::L2, 9)] + #[case::cosine(DistanceType::Cosine, 9)] + // ex_bits=3 and ex_bits=5 have no FastScan support and use the bit-plane + // repack, so these searches go through the exact ex-dot rerank kernels + // end to end. 
+ #[case::l2_plane_repack_3bit(DistanceType::L2, 4)] + #[case::l2_plane_repack_5bit(DistanceType::L2, 6)] #[tokio::test] async fn test_build_ivf_rq_multi_bit_persists_split_codes_and_searches( #[case] distance_type: DistanceType, + #[case] num_bits: u8, ) { let test_dir = TempStrDir::default(); let test_uri = test_dir.as_str(); let (mut dataset, vectors) = generate_test_dataset::(test_uri, 0.0..1.0).await; let ivf_params = IvfBuildParams::new(4); - let rq_params = RQBuildParams::with_rotation_type(9, RQRotationType::Fast); + let rq_params = RQBuildParams::with_rotation_type(num_bits, RQRotationType::Fast); let params = VectorIndexParams::with_ivf_rq_params(distance_type, ivf_params, rq_params); dataset .create_index(&["vector"], IndexType::Vector, None, ¶ms, true) @@ -4469,16 +4482,18 @@ mod tests { let scheduler = ScanScheduler::new(obj_store, SchedulerConfig::default_for_testing()); let index_uuid = indices[0].uuid.to_string(); let rq_meta = get_rq_metadata(&dataset, scheduler.clone(), &index_uuid).await; - assert_eq!(rq_meta.num_bits, 9); + assert_eq!(rq_meta.num_bits, num_bits); assert_eq!(rq_meta.query_estimator, RabitQueryEstimator::RawQuery); let reader = open_rq_aux_reader(&dataset, scheduler, &index_uuid).await; let schema = reader.schema(); - let ex_field = schema.field(RABIT_EX_CODE_COLUMN).unwrap(); + let ex_field = schema.field(RABIT_BLOCKED_EX_CODE_COLUMN).unwrap(); let DataType::FixedSizeList(_, ex_code_bytes) = ex_field.data_type() else { panic!("RQ ex-code field should be FixedSizeList"); }; - assert_eq!(ex_code_bytes, 32); + let expected_ex_code_bytes = + blocked_ex_code_bytes(rq_meta.rotated_dim(), num_bits - 1) as i32; + assert_eq!(ex_code_bytes, expected_ex_code_bytes); assert!(schema.field(EX_ADD_FACTORS_COLUMN).is_some()); assert!(schema.field(EX_SCALE_FACTORS_COLUMN).is_some()); From 5ca58106e27622e99e92a153513a61022ac81fc9 Mon Sep 17 00:00:00 2001 From: vinoyang Date: Fri, 12 Jun 2026 16:45:54 +0800 Subject: [PATCH 092/177] feat(rust): add 
cleanup explain API (#7147) ## Summary - Add `Dataset::cleanup(policy)` with two terminal actions: - `explain()` returns a read-only `CleanupExplanation` - `execute()` re-evaluates current dataset/ref state before deleting files - Add explanation details including read version, aggregate removal stats, candidate files, truncation metadata, referenced branch details, and warnings. - Keep existing Rust execution APIs (`cleanup_old_versions`, `cleanup_with_policy`) compatible. ## Tests - `cargo fmt --all` - `cargo test -p lance dataset::cleanup::tests -- --nocapture` - `cargo check -p lance --tests` `cargo clippy -p lance --all-targets -- -D warnings` was attempted, but currently fails on unrelated pre-existing lint errors outside this change. --- rust/lance/src/dataset.rs | 16 +- rust/lance/src/dataset/cleanup.rs | 773 ++++++++++++++++++++++++------ 2 files changed, 645 insertions(+), 144 deletions(-) diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index f6fd1ef6a20..23d824fd6fd 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -24,8 +24,7 @@ use lance_core::datatypes::{OnMissing, OnTypeMismatch, Projectable, Projection}; use lance_core::traits::DatasetTakeRows; use lance_core::utils::address::RowAddress; use lance_core::utils::tracing::{ - DATASET_CLEANING_EVENT, DATASET_DELETING_EVENT, DATASET_DROPPING_COLUMN_EVENT, - TRACE_DATASET_EVENTS, + DATASET_DELETING_EVENT, DATASET_DROPPING_COLUMN_EVENT, TRACE_DATASET_EVENTS, }; use lance_datafusion::projection::ProjectionPlan; use lance_file::datatypes::populate_schema_dictionary; @@ -104,7 +103,7 @@ use self::scanner::{DatasetRecordBatchStream, Scanner}; use self::transaction::{Operation, Transaction, TransactionBuilder, UpdateMapEntry}; use self::write::{cleanup_data_fragments, write_fragments_internal}; use crate::dataset::branch_location::BranchLocation; -use crate::dataset::cleanup::{CleanupPolicy, CleanupPolicyBuilder}; +use crate::dataset::cleanup::{CleanupOperation, 
CleanupPolicy, CleanupPolicyBuilder}; use crate::dataset::refs::{BranchContents, BranchIdentifier, Branches, Tags}; use crate::dataset::sql::SqlQueryBuilder; use crate::datatypes::Schema; @@ -1286,8 +1285,15 @@ impl Dataset { &self, policy: CleanupPolicy, ) -> BoxFuture<'_, Result> { - info!(target: TRACE_DATASET_EVENTS, event=DATASET_CLEANING_EVENT, uri=&self.uri); - cleanup::cleanup_old_versions(self, policy).boxed() + async move { self.cleanup(policy).execute().await }.boxed() + } + + /// Creates a cleanup operation for this dataset. + /// + /// The returned operation can be explained without deleting files, or + /// executed to re-evaluate the current dataset state and remove files. + pub fn cleanup(&self, policy: CleanupPolicy) -> CleanupOperation<'_> { + CleanupOperation::new(self, policy) } #[allow(clippy::too_many_arguments)] diff --git a/rust/lance/src/dataset/cleanup.rs b/rust/lance/src/dataset/cleanup.rs index b3ca60cfa0f..65928038cea 100644 --- a/rust/lance/src/dataset/cleanup.rs +++ b/rust/lance/src/dataset/cleanup.rs @@ -46,7 +46,8 @@ use lance_core::{ Error, Result, utils::tracing::{ AUDIT_MODE_DELETE, AUDIT_MODE_DELETE_UNVERIFIED, AUDIT_TYPE_DATA, AUDIT_TYPE_DELETION, - AUDIT_TYPE_INDEX, AUDIT_TYPE_MANIFEST, TRACE_FILE_AUDIT, + AUDIT_TYPE_INDEX, AUDIT_TYPE_MANIFEST, DATASET_CLEANING_EVENT, TRACE_DATASET_EVENTS, + TRACE_FILE_AUDIT, }, }; use lance_table::{ @@ -78,7 +79,7 @@ struct ReferencedFiles { index_uuids: HashSet, } -#[derive(Clone, Debug, Default)] +#[derive(Clone, Debug, Default, PartialEq, Eq)] pub struct RemovalStats { pub bytes_removed: u64, pub old_versions: u64, @@ -88,12 +89,194 @@ pub struct RemovalStats { pub deletion_files_removed: u64, } -#[derive(Clone, Copy, Debug)] -enum RemovedFileType { +/// A read-only explanation of what a cleanup operation would remove. +/// +/// This is an explanation, not a deletion plan. 
Calling +/// [`CleanupOperation::execute`] re-evaluates the current dataset and reference +/// state before deleting files. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct CleanupExplanation { + /// Dataset version observed when the explanation was produced. + pub read_version: u64, + /// Aggregate statistics for files that would be removed. + pub stats: RemovalStats, + /// Candidate files that would be removed, capped by `candidate_file_limit`. + pub candidate_files: Vec, + /// True if more candidate files were found than are included. + pub candidate_files_truncated: bool, + /// Maximum number of candidate files included in this explanation. + pub candidate_file_limit: usize, + /// Referenced child branches and whether cleanup would cascade into them. + pub referenced_branches: Vec, + /// Non-fatal warnings about the explanation. + pub warnings: Vec, +} + +/// A file that cleanup identified as removable. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct CleanupCandidateFile { + /// Dataset-relative or storage path for the candidate file. + pub path: String, + /// Kind of file identified by cleanup. + pub kind: CleanupFileKind, + /// True if the file is removable only because it aged past the unverified + /// retention threshold or `delete_unverified` is enabled. + pub unverified: bool, + /// Candidate file size in bytes. + pub size_bytes: u64, +} + +/// A branch that references the current branch lineage. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct CleanupReferencedBranch { + /// Branch name. + pub name: String, + /// Version of the current lineage referenced by this branch. + pub referenced_version: u64, + /// True if this branch would be cleaned when cascading cleanup is enabled. + pub cleanup_candidate: bool, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum CleanupFileKind { + Manifest, Data, Transaction, Index, Deletion, + /// A leftover `_versions/.tmp` manifest from a failed transaction. 
These + /// are deleted but excluded from per-kind `RemovalStats` counts and audit + /// logs to match the long-standing cleanup behavior. Their bytes + /// are still included in `bytes_removed`. + TemporaryManifest, +} + +impl CleanupCandidateFile { + fn from_cleanup_file(file: &CleanupFile) -> Self { + Self { + path: file.path.to_string(), + kind: file.kind, + unverified: file.unverified, + size_bytes: file.size_bytes, + } + } +} + +fn cleanup_file( + path: Path, + kind: CleanupFileKind, + unverified: bool, + size_bytes: u64, +) -> Option { + Some(CleanupFile { + path, + kind, + unverified, + size_bytes, + }) +} + +#[derive(Clone, Debug)] +struct CleanupFile { + path: Path, + kind: CleanupFileKind, + /// True when the file was kept on disk past its referenced lifetime + /// because we could not verify it was safe to remove (e.g. produced by an + /// unfinished commit) and is being deleted only because it has aged past + /// the unverified-retention threshold or `delete_unverified` is set. 
+ unverified: bool, + size_bytes: u64, +} + +impl RemovalStats { + fn record_file(&mut self, file: &CleanupFile) { + self.bytes_removed += file.size_bytes; + match file.kind { + CleanupFileKind::Manifest => self.old_versions += 1, + CleanupFileKind::Data => self.data_files_removed += 1, + CleanupFileKind::Transaction => self.transaction_files_removed += 1, + CleanupFileKind::Index => self.index_files_removed += 1, + CleanupFileKind::Deletion => self.deletion_files_removed += 1, + CleanupFileKind::TemporaryManifest => {} + } + } + + fn merge(&mut self, other: &Self) { + self.bytes_removed += other.bytes_removed; + self.old_versions += other.old_versions; + self.data_files_removed += other.data_files_removed; + self.transaction_files_removed += other.transaction_files_removed; + self.index_files_removed += other.index_files_removed; + self.deletion_files_removed += other.deletion_files_removed; + } +} + +#[derive(Debug, Default)] +struct CleanupRunResult { + stats: RemovalStats, + removed_manifests: HashSet, + candidate_files: Vec, + candidate_files_truncated: bool, + referenced_branches: Vec, +} + +impl CleanupRunResult { + fn record_file( + &mut self, + file: &CleanupFile, + candidate_file_limit: Option, + track_removed_manifests: bool, + ) { + self.stats.record_file(file); + if track_removed_manifests && matches!(file.kind, CleanupFileKind::Manifest) { + self.removed_manifests.insert(file.path.clone()); + } + if let Some(limit) = candidate_file_limit { + if self.candidate_files.len() < limit { + self.candidate_files + .push(CleanupCandidateFile::from_cleanup_file(file)); + } else { + self.candidate_files_truncated = true; + } + } + } + + fn merge(&mut self, other: Self, candidate_file_limit: Option) { + self.stats.merge(&other.stats); + self.removed_manifests.extend(other.removed_manifests); + self.referenced_branches.extend(other.referenced_branches); + if let Some(limit) = candidate_file_limit { + for file in other.candidate_files { + if 
self.candidate_files.len() < limit { + self.candidate_files.push(file); + } else { + self.candidate_files_truncated = true; + } + } + self.candidate_files_truncated |= other.candidate_files_truncated; + } + } +} + +#[derive(Clone, Copy, Debug)] +enum CleanupAction { + Execute, + Explain { max_candidate_files: usize }, +} + +impl CleanupAction { + fn deletes_files(self) -> bool { + matches!(self, Self::Execute) + } + + fn candidate_file_limit(self) -> Option { + match self { + Self::Execute => None, + Self::Explain { + max_candidate_files, + } => Some(max_candidate_files), + } + } } fn remove_prefix(path: &Path, prefix: &Path) -> Path { @@ -108,6 +291,11 @@ fn remove_prefix(path: &Path, prefix: &Path) -> Path { struct CleanupTask<'a> { dataset: &'a Dataset, policy: CleanupPolicy, + action: CleanupAction, + read_version: u64, + ignored_manifests: HashSet, + track_removed_manifests: bool, + include_referenced_branches: bool, } /// Information about the dataset that we learn by inspecting all of the manifests @@ -131,21 +319,131 @@ struct CleanupInspection { const UNVERIFIED_THRESHOLD_DAYS: i64 = 7; const S3_DELETE_STREAM_BATCH_SIZE: u64 = 1_000; const AZURE_DELETE_STREAM_BATCH_SIZE: u64 = 256; +const DEFAULT_EXPLANATION_MAX_CANDIDATE_FILES: usize = 1_000; + +/// Builder-style cleanup operation. +/// +/// Call [`Self::explain`] for a read-only explanation of what cleanup would +/// remove, or [`Self::execute`] to re-evaluate the current dataset state and +/// delete files. +pub struct CleanupOperation<'a> { + dataset: &'a Dataset, + policy: CleanupPolicy, + max_candidate_files: usize, +} + +impl<'a> CleanupOperation<'a> { + pub(crate) fn new(dataset: &'a Dataset, policy: CleanupPolicy) -> Self { + Self { + dataset, + policy, + max_candidate_files: DEFAULT_EXPLANATION_MAX_CANDIDATE_FILES, + } + } + + /// Set the maximum number of candidate files included in explanations. 
+ /// + /// The aggregate [`RemovalStats`] in [`CleanupExplanation`] still include + /// all files that would be removed. + pub fn with_max_candidate_files(mut self, max_candidate_files: usize) -> Self { + self.max_candidate_files = max_candidate_files; + self + } + + /// Explain what cleanup would remove without deleting files. + pub async fn explain(&self) -> Result { + let cleanup = CleanupTask::new( + self.dataset, + self.policy.clone(), + CleanupAction::Explain { + max_candidate_files: self.max_candidate_files, + }, + ); + let read_version = cleanup.read_version; + let result = cleanup.run().await?; + let warnings = if result.candidate_files_truncated { + vec![format!( + "candidate_files truncated to {} entries", + self.max_candidate_files + )] + } else { + Vec::new() + }; + Ok(CleanupExplanation { + read_version, + stats: result.stats, + candidate_files: result.candidate_files, + candidate_files_truncated: result.candidate_files_truncated, + candidate_file_limit: self.max_candidate_files, + referenced_branches: result.referenced_branches, + warnings, + }) + } + + /// Execute cleanup by re-evaluating the current dataset state. 
+ pub async fn execute(&self) -> Result { + info!(target: TRACE_DATASET_EVENTS, event=DATASET_CLEANING_EVENT, uri=&self.dataset.uri); + let cleanup = CleanupTask::new(self.dataset, self.policy.clone(), CleanupAction::Execute); + Ok(cleanup.run().await?.stats) + } +} impl<'a> CleanupTask<'a> { - fn new(dataset: &'a Dataset, policy: CleanupPolicy) -> Self { - Self { dataset, policy } + fn new(dataset: &'a Dataset, policy: CleanupPolicy, action: CleanupAction) -> Self { + let track_removed_manifests = policy.clean_referenced_branches; + let include_referenced_branches = action.candidate_file_limit().is_some(); + Self::new_with_ignored_manifests( + dataset, + policy, + action, + HashSet::new(), + track_removed_manifests, + include_referenced_branches, + ) + } + + fn new_with_ignored_manifests( + dataset: &'a Dataset, + policy: CleanupPolicy, + action: CleanupAction, + ignored_manifests: HashSet, + track_removed_manifests: bool, + include_referenced_branches: bool, + ) -> Self { + Self { + dataset, + policy, + action, + read_version: dataset.version().version, + ignored_manifests, + track_removed_manifests, + include_referenced_branches, + } } - async fn run(self) -> Result { - let mut final_stats = RemovalStats::default(); + async fn run(self) -> Result { + let mut final_result = CleanupRunResult::default(); + let candidate_file_limit = self.action.candidate_file_limit(); // First check if we need to clean referenced branches // For cases that referenced branches never clean and the current cleanup cannot clean anything // This must happen before cleaning the current branch if the setting is enabled. 
let referenced_branches: Vec<(String, u64)> = self.find_referenced_branches().await?; + if self.include_referenced_branches { + final_result.referenced_branches = referenced_branches + .iter() + .map(|(name, referenced_version)| CleanupReferencedBranch { + name: name.clone(), + referenced_version: *referenced_version, + cleanup_candidate: self.policy.clean_referenced_branches, + }) + .collect(); + } if self.policy.clean_referenced_branches { - self.clean_referenced_branches(&referenced_branches).await?; + final_result.merge( + self.clean_referenced_branches(&referenced_branches).await?, + candidate_file_limit, + ); } // we process all manifest files in parallel to figure @@ -179,19 +477,21 @@ impl<'a> CleanupTask<'a> { } if !referenced_branches.is_empty() { + let ignored_manifests: HashSet<_> = final_result + .removed_manifests + .union(&self.ignored_manifests) + .cloned() + .collect(); inspection = self - .retain_branch_lineage_files(inspection, &referenced_branches) + .retain_branch_lineage_files(inspection, &referenced_branches, &ignored_manifests) .await? 
}; - let stats = self.delete_unreferenced_files(inspection).await?; - final_stats.bytes_removed += stats.bytes_removed; - final_stats.old_versions += stats.old_versions; - final_stats.data_files_removed += stats.data_files_removed; - final_stats.transaction_files_removed += stats.transaction_files_removed; - final_stats.index_files_removed += stats.index_files_removed; - final_stats.deletion_files_removed += stats.deletion_files_removed; - Ok(final_stats) + final_result.merge( + self.delete_unreferenced_files(inspection).await?, + candidate_file_limit, + ); + Ok(final_result) } #[instrument(level = "debug", skip_all)] @@ -203,6 +503,7 @@ impl<'a> CleanupTask<'a> { self.dataset .commit_handler .list_manifest_locations(&self.dataset.base, &self.dataset.object_store, false) + .try_filter(|location| future::ready(!self.ignored_manifests.contains(&location.path))) .try_for_each_concurrent(self.dataset.object_store.io_parallelism(), |location| { self.process_manifest_file(location, &inspection, tagged_versions) }) @@ -224,12 +525,10 @@ impl<'a> CleanupTask<'a> { let manifest = read_manifest(&self.dataset.object_store, &location.path, location.size).await?; - let dataset_version = self.dataset.version().version; - // Don't delete the latest version, even if it is old. Don't delete tagged versions, // regardless of age. Don't delete manifests if their version is newer than the dataset // version. These are either in-progress or newly added since we started. 
- let is_latest = dataset_version <= manifest.version; + let is_latest = self.read_version <= manifest.version; let is_tagged = tagged_versions.contains(&manifest.version); let in_working_set = is_latest || !self.policy.should_clean(&manifest) || is_tagged; let indexes = @@ -319,8 +618,10 @@ impl<'a> CleanupTask<'a> { async fn delete_unreferenced_files( &self, inspection: CleanupInspection, - ) -> Result { - let removal_stats = Mutex::new(RemovalStats::default()); + ) -> Result { + let cleanup_result = Mutex::new(CleanupRunResult::default()); + let deletes_files = self.action.deletes_files(); + let candidate_file_limit = self.action.candidate_file_limit(); let verification_threshold = utc_now() - TimeDelta::try_days(UNVERIFIED_THRESHOLD_DAYS).expect("TimeDelta::try_days"); @@ -335,9 +636,8 @@ impl<'a> CleanupTask<'a> { ) }; // Build stream for a managed subtree - let build_listing_stream = |dir: Path, file_type: Option| { + let build_listing_stream = |dir: Path| { let inspection_ref = &inspection; - let removal_stats_ref = &removal_stats; self.dataset .object_store .read_dir_all(&dir, inspection.earliest_retained_manifest_time) @@ -356,118 +656,133 @@ impl<'a> CleanupTask<'a> { // delete it if we can verify it is part of an old version. 
let maybe_in_progress = !self.policy.delete_unverified && obj_meta.last_modified >= verification_threshold; - let path_to_remove = self.path_if_not_referenced( - obj_meta.location, + let file_to_remove = self.cleanup_file_if_not_referenced( + obj_meta, maybe_in_progress, inspection_ref, ); - if matches!(path_to_remove, Ok(Some(..))) { - let mut stats = removal_stats_ref.lock().unwrap(); - stats.bytes_removed += obj_meta.size; - if let Some(file_type) = file_type { - match file_type { - RemovedFileType::Data => stats.data_files_removed += 1, - RemovedFileType::Transaction => { - stats.transaction_files_removed += 1 - } - RemovedFileType::Index => stats.index_files_removed += 1, - RemovedFileType::Deletion => stats.deletion_files_removed += 1, - } - } - } - future::ready(path_to_remove) + future::ready(file_to_remove) }) .boxed() }; // Restrict scanning to Lance-managed subtrees for safety and performance. let streams = vec![ - build_listing_stream(self.dataset.versions_dir(), None), - build_listing_stream( - self.dataset.transactions_dir(), - Some(RemovedFileType::Transaction), - ), - build_listing_stream(self.dataset.data_dir(), Some(RemovedFileType::Data)), - build_listing_stream(self.dataset.indices_dir(), Some(RemovedFileType::Index)), - build_listing_stream( - self.dataset.deletions_dir(), - Some(RemovedFileType::Deletion), - ), + build_listing_stream(self.dataset.versions_dir()), + build_listing_stream(self.dataset.transactions_dir()), + build_listing_stream(self.dataset.data_dir()), + build_listing_stream(self.dataset.indices_dir()), + build_listing_stream(self.dataset.deletions_dir()), ]; - let unreferenced_paths = stream::iter(streams).flatten().boxed(); + let unreferenced_files = stream::iter(streams).flatten().boxed(); let old_manifests = inspection.old_manifests.clone(); - let num_old_manifests = old_manifests.len(); - - // Ideally this collect shouldn't be needed here but it seems necessary - // to avoid https://github.com/rust-lang/rust/issues/102211 - 
let manifest_bytes_removed = stream::iter(old_manifests.keys()) - .map(|path| self.dataset.object_store.size(path)) - .collect::>() - .await; - let manifest_bytes_removed = stream::iter(manifest_bytes_removed) - .buffer_unordered(self.dataset.object_store.io_parallelism()) - .try_fold(0, |acc, size| async move { Ok(acc + (size)) }) - .await; - - let old_manifests_stream = stream::iter(old_manifests.into_keys()) - .map(|path| { - info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_MANIFEST, path = path.as_ref()); - Ok(path) + let manifest_files = stream::iter(old_manifests) + .map(|(path, _version)| async move { + let size_bytes = self.dataset.object_store.size(&path).await?; + Ok::(CleanupFile { + path, + kind: CleanupFileKind::Manifest, + unverified: false, + size_bytes, + }) }) + .buffer_unordered(self.dataset.object_store.io_parallelism()) .boxed(); - let all_paths_to_remove = - stream::iter(vec![unreferenced_paths, old_manifests_stream]).flatten(); - - let paths_to_delete: BoxStream> = if let Some(rate) = - self.policy.delete_rate_limit - { - let duration = calculate_duration(self.dataset.object_store.scheme().to_string(), rate); - let mut ticker = interval(duration); - ticker.set_missed_tick_behavior(MissedTickBehavior::Delay); - IntervalStream::new(ticker) - .zip(all_paths_to_remove) - .map(|(_, path)| path) - .boxed() - } else { - all_paths_to_remove.boxed() - }; - let delete_fut = self - .dataset - .object_store - .remove_stream(paths_to_delete) - .try_for_each(|_| future::ready(Ok(()))); + let all_files = stream::iter(vec![unreferenced_files, manifest_files]).flatten(); + let all_paths_to_remove = all_files.map(|file| { + let file = file?; + if deletes_files { + let mode = if file.unverified { + AUDIT_MODE_DELETE_UNVERIFIED + } else { + AUDIT_MODE_DELETE + }; + let path_str = file.path.as_ref(); + match file.kind { + CleanupFileKind::Manifest => { + info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_MANIFEST, 
path = path_str); + } + CleanupFileKind::Data => { + info!(target: TRACE_FILE_AUDIT, mode=mode, r#type=AUDIT_TYPE_DATA, path = path_str); + } + CleanupFileKind::Deletion => { + info!(target: TRACE_FILE_AUDIT, mode=mode, r#type=AUDIT_TYPE_DELETION, path = path_str); + } + CleanupFileKind::Index => { + info!(target: TRACE_FILE_AUDIT, mode=mode, r#type=AUDIT_TYPE_INDEX, path = path_str); + } + CleanupFileKind::Transaction | CleanupFileKind::TemporaryManifest => {} + } + } + cleanup_result + .lock() + .unwrap() + .record_file(&file, candidate_file_limit, self.track_removed_manifests); + Ok(file.path) + }); + + if deletes_files { + let paths_to_delete: BoxStream> = + if let Some(rate) = self.policy.delete_rate_limit { + let duration = + calculate_duration(self.dataset.object_store.scheme().to_string(), rate); + let mut ticker = interval(duration); + ticker.set_missed_tick_behavior(MissedTickBehavior::Delay); + IntervalStream::new(ticker) + .zip(all_paths_to_remove) + .map(|(_, path)| path) + .boxed() + } else { + all_paths_to_remove.boxed() + }; - delete_fut.await?; + self.dataset + .object_store + .remove_stream(paths_to_delete) + .try_for_each(|_| future::ready(Ok(()))) + .await?; + } else { + // Drain the stream to populate stats, but do not call remove_stream. 
+ all_paths_to_remove + .try_for_each(|_| future::ready(Ok(()))) + .await?; + } - let mut removal_stats = removal_stats.into_inner().unwrap(); - removal_stats.old_versions = num_old_manifests as u64; - removal_stats.bytes_removed += manifest_bytes_removed?; + let cleanup_result = cleanup_result.into_inner().unwrap(); let span = Span::current(); - span.record("bytes_removed", removal_stats.bytes_removed); - span.record("data_files_removed", removal_stats.data_files_removed); + span.record("bytes_removed", cleanup_result.stats.bytes_removed); + span.record( + "data_files_removed", + cleanup_result.stats.data_files_removed, + ); span.record( "transaction_files_removed", - removal_stats.transaction_files_removed, + cleanup_result.stats.transaction_files_removed, + ); + span.record( + "index_files_removed", + cleanup_result.stats.index_files_removed, ); - span.record("index_files_removed", removal_stats.index_files_removed); span.record( "deletion_files_removed", - removal_stats.deletion_files_removed, + cleanup_result.stats.deletion_files_removed, ); - Ok(removal_stats) + Ok(cleanup_result) } - fn path_if_not_referenced( + fn cleanup_file_if_not_referenced( &self, - path: Path, + obj_meta: ObjectMeta, maybe_in_progress: bool, inspection: &CleanupInspection, - ) -> Result> { + ) -> Result> { + let path = obj_meta.location; let relative_path = remove_prefix(&path, &self.dataset.base); + let size_bytes = obj_meta.size; if relative_path.as_ref().starts_with("_versions/.tmp") { // This is a temporary manifest file. 
// @@ -476,7 +791,12 @@ impl<'a> CleanupTask<'a> { if maybe_in_progress { return Ok(None); } else { - return Ok(Some(path)); + return Ok(cleanup_file( + path, + CleanupFileKind::TemporaryManifest, + true, + size_bytes, + )); } } if relative_path.as_ref().starts_with("_indices") { @@ -490,15 +810,18 @@ impl<'a> CleanupTask<'a> { { return Ok(None); } else if !maybe_in_progress { - info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE_UNVERIFIED, r#type=AUDIT_TYPE_INDEX, path = path.to_string()); - return Ok(Some(path)); + return Ok(cleanup_file(path, CleanupFileKind::Index, true, size_bytes)); } else if inspection .verified_files .index_uuids .contains(uuid.as_ref()) { - info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_INDEX, path = path.to_string()); - return Ok(Some(path)); + return Ok(cleanup_file( + path, + CleanupFileKind::Index, + false, + size_bytes, + )); } } else { return Ok(None); @@ -514,15 +837,13 @@ impl<'a> CleanupTask<'a> { { Ok(None) } else if !maybe_in_progress { - info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE_UNVERIFIED, r#type=AUDIT_TYPE_DATA, path = path.to_string()); - Ok(Some(path)) + Ok(cleanup_file(path, CleanupFileKind::Data, true, size_bytes)) } else if inspection .verified_files .data_paths .contains(&relative_path) { - info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_DATA, path = path.to_string()); - Ok(Some(path)) + Ok(cleanup_file(path, CleanupFileKind::Data, false, size_bytes)) } else { Ok(None) } @@ -587,15 +908,13 @@ impl<'a> CleanupTask<'a> { { Ok(None) } else if !maybe_in_progress { - info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE_UNVERIFIED, r#type=AUDIT_TYPE_DATA, path = path.to_string()); - Ok(Some(path)) + Ok(cleanup_file(path, CleanupFileKind::Data, true, size_bytes)) } else if inspection .verified_files .data_paths .contains(&parent_data_path) { - info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_DATA, path = path.to_string()); - 
Ok(Some(path)) + Ok(cleanup_file(path, CleanupFileKind::Data, false, size_bytes)) } else { Ok(None) } @@ -613,15 +932,23 @@ impl<'a> CleanupTask<'a> { { Ok(None) } else if !maybe_in_progress { - info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE_UNVERIFIED, r#type=AUDIT_TYPE_DELETION, path = path.to_string()); - Ok(Some(path)) + Ok(cleanup_file( + path, + CleanupFileKind::Deletion, + true, + size_bytes, + )) } else if inspection .verified_files .delete_paths .contains(&relative_path) { - info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_DELETION, path = path.to_string()); - Ok(Some(path)) + Ok(cleanup_file( + path, + CleanupFileKind::Deletion, + false, + size_bytes, + )) } else { Ok(None) } @@ -640,7 +967,14 @@ impl<'a> CleanupTask<'a> { } else if !maybe_in_progress || inspection.verified_files.tx_paths.contains(&relative_path) { - Ok(Some(path)) + let unverified = + !inspection.verified_files.tx_paths.contains(&relative_path); + Ok(cleanup_file( + path, + CleanupFileKind::Transaction, + unverified, + size_bytes, + )) } else { Ok(None) } @@ -709,8 +1043,8 @@ impl<'a> CleanupTask<'a> { async fn clean_referenced_branches( &self, referenced_branches: &[(String, u64)], - ) -> Result { - let final_stats = Mutex::new(RemovalStats::default()); + ) -> Result { + let final_result = Mutex::new(CleanupRunResult::default()); // Group branches by their lineage identifier (BranchIdentifier). 
// Branches with the same identifier share a lineage and must be cleaned sequentially @@ -722,30 +1056,32 @@ impl<'a> CleanupTask<'a> { .or_insert_with(Vec::new) .push(branch.clone()); } + let action = self.action; + let candidate_file_limit = self.action.candidate_file_limit(); let tasks: Vec<_> = branches_chains .values() .map(|branch_chain| { - let final_stats = &final_stats; + let final_result = &final_result; async move { for branch in branch_chain { let branch_dataset = self .dataset .checkout_version((branch.as_str(), None)) .await?; - if let Some(stats) = cleanup_cascade_branch( + let ignored_manifests = + final_result.lock().unwrap().removed_manifests.clone(); + if let Some(result) = cleanup_cascade_branch_run( &branch_dataset, branch_dataset.manifest.as_ref(), + action, + ignored_manifests, ) .await? { - let mut stats_guard = final_stats.lock().unwrap(); - stats_guard.bytes_removed += stats.bytes_removed; - stats_guard.old_versions += stats.old_versions; - stats_guard.data_files_removed += stats.data_files_removed; - stats_guard.transaction_files_removed += - stats.transaction_files_removed; - stats_guard.index_files_removed += stats.index_files_removed; - stats_guard.deletion_files_removed += stats.deletion_files_removed; + final_result + .lock() + .unwrap() + .merge(result, candidate_file_limit); } } Ok::<(), Error>(()) @@ -753,7 +1089,7 @@ impl<'a> CleanupTask<'a> { }) .collect(); try_join_all(tasks).await?; - Ok(final_stats.into_inner().unwrap()) + Ok(final_result.into_inner().unwrap()) } // Retain manifests containing files referenced by descendant branches. 
@@ -762,6 +1098,7 @@ impl<'a> CleanupTask<'a> { &self, inspection: CleanupInspection, referenced_branches: &[(String, u64)], + removed_branch_manifests: &HashSet, ) -> Result { let inspection = Mutex::new(inspection); for (branch, root_version_number) in referenced_branches { @@ -772,6 +1109,9 @@ impl<'a> CleanupTask<'a> { self.dataset .commit_handler .list_manifest_locations(&branch_location.path, &self.dataset.object_store, false) + .try_filter(|location| { + future::ready(!removed_branch_manifests.contains(&location.path)) + }) .try_for_each_concurrent(self.dataset.object_store.io_parallelism(), |location| { self.process_branch_referenced_manifests( location, @@ -1020,8 +1360,7 @@ pub async fn cleanup_old_versions( dataset: &Dataset, policy: CleanupPolicy, ) -> Result { - let cleanup = CleanupTask::new(dataset, policy); - cleanup.run().await + CleanupOperation::new(dataset, policy).execute().await } /// If the dataset config has `lance.auto_cleanup` parameters set, @@ -1048,11 +1387,35 @@ pub async fn cleanup_cascade_branch( dataset: &Dataset, manifest: &Manifest, ) -> Result> { + Ok( + cleanup_cascade_branch_run(dataset, manifest, CleanupAction::Execute, HashSet::new()) + .await? 
+ .map(|result| result.stats), + ) +} + +async fn cleanup_cascade_branch_run( + dataset: &Dataset, + manifest: &Manifest, + action: CleanupAction, + ignored_manifests: HashSet, +) -> Result> { let policy = build_cleanup_policy(dataset, manifest).await?; if let Some(mut policy) = policy { policy.clean_referenced_branches = false; policy.error_if_tagged_old_versions = false; - Ok(Some(dataset.cleanup_with_policy(policy).await?)) + if action.deletes_files() { + info!(target: TRACE_DATASET_EVENTS, event=DATASET_CLEANING_EVENT, uri=&dataset.uri); + } + let cleanup = CleanupTask::new_with_ignored_manifests( + dataset, + policy, + action, + ignored_manifests, + true, + false, + ); + Ok(Some(cleanup.run().await?)) } else { Ok(None) } @@ -1443,6 +1806,14 @@ mod tests { cleanup_old_versions(&db, policy).await } + async fn explain_cleanup_with_policy( + &self, + policy: CleanupPolicy, + ) -> Result { + let db = self.open().await?; + db.cleanup(policy).explain().await + } + async fn run_cleanup_with_override( &self, before: DateTime, @@ -1670,6 +2041,51 @@ mod tests { assert_gt!(after_count.num_tx_files, 0); } + #[tokio::test] + async fn explain_cleanup_does_not_delete_files() { + let fixture = MockDatasetFixture::try_new().unwrap(); + fixture.create_some_data().await.unwrap(); + MockClock::set_system_time(TimeDelta::try_seconds(1).unwrap().to_std().unwrap()); + fixture.overwrite_some_data().await.unwrap(); + + let before_count = fixture.count_files().await.unwrap(); + let policy = CleanupPolicyBuilder::default() + .before_timestamp(utc_now()) + .build(); + + let explanation = fixture + .explain_cleanup_with_policy(policy.clone()) + .await + .unwrap(); + let after_preview_count = fixture.count_files().await.unwrap(); + + // Files are not actually removed when explaining cleanup. 
+ assert_eq!(before_count, after_preview_count); + assert_eq!(explanation.read_version, 2); + assert_eq!(explanation.stats.old_versions, 1); + assert_eq!(explanation.stats.data_files_removed, 1); + assert_eq!(explanation.stats.transaction_files_removed, 1); + assert_gt!(explanation.stats.bytes_removed, 0); + assert!(!explanation.candidate_files.is_empty()); + assert!(!explanation.candidate_files_truncated); + + // Running cleanup with the same policy should remove the same files the + // explanation reported for this unchanged dataset. + let removed = fixture.run_cleanup_with_policy(policy).await.unwrap(); + let after_cleanup_count = fixture.count_files().await.unwrap(); + + assert_eq!( + removed.bytes_removed, + before_count.num_bytes - after_cleanup_count.num_bytes + ); + assert_eq!(removed.old_versions, explanation.stats.old_versions); + assert_eq!( + removed.data_files_removed, + explanation.stats.data_files_removed + ); + assert_eq!(removed.bytes_removed, explanation.stats.bytes_removed); + } + #[tokio::test] async fn cleanup_blob_v2_sidecar_files() { let fixture = MockDatasetFixture::try_new().unwrap(); @@ -3073,6 +3489,17 @@ mod tests { self.run_cleanup_inner(policy).await } + async fn explain_cleanup_with_referenced_branches(&mut self) -> Result { + let policy = CleanupPolicyBuilder::default() + .error_if_tagged_old_versions(false) + .clean_referenced_branches(true) + .retain_n_versions(&self.dataset, 1) + .await? 
+ .build(); + self.dataset.checkout_latest().await?; + self.dataset.cleanup(policy).explain().await + } + async fn run_cleanup_inner(&mut self, policy: CleanupPolicy) -> Result { let pre_count = self.count_data().await?; self.dataset.checkout_latest().await?; @@ -3653,6 +4080,74 @@ mod tests { setup.assert_unchanged(&["branch4"]).await; } + #[tokio::test] + async fn explain_cleanup_with_referenced_branches_matches_cleanup() { + let mut setup = build_lineage_datasets().await.unwrap(); + + setup.enable_auto_cleanup().await.unwrap(); + setup.main.write_data().await.unwrap(); + setup.main.compact().await.unwrap(); + setup.branch4.compact().await.unwrap(); + setup.branch1.write_data().await.unwrap(); + setup.branch1.compact().await.unwrap(); + setup.branch2.write_data().await.unwrap(); + setup.branch2.compact().await.unwrap(); + setup.branch3.write_data().await.unwrap(); + setup.branch3.compact().await.unwrap(); + + setup.main.refresh().await.unwrap(); + setup.branch1.refresh().await.unwrap(); + setup.branch2.refresh().await.unwrap(); + setup.branch3.refresh().await.unwrap(); + setup.branch4.refresh().await.unwrap(); + let main_counts_before = setup.main.counts; + let branch1_counts_before = setup.branch1.counts; + let branch2_counts_before = setup.branch2.counts; + let branch3_counts_before = setup.branch3.counts; + let branch4_counts_before = setup.branch4.counts; + + let explanation = setup + .main + .explain_cleanup_with_referenced_branches() + .await + .unwrap(); + + setup.main.refresh().await.unwrap(); + setup.branch1.refresh().await.unwrap(); + setup.branch2.refresh().await.unwrap(); + setup.branch3.refresh().await.unwrap(); + setup.branch4.refresh().await.unwrap(); + assert_eq!(setup.main.counts, main_counts_before); + assert_eq!(setup.branch1.counts, branch1_counts_before); + assert_eq!(setup.branch2.counts, branch2_counts_before); + assert_eq!(setup.branch3.counts, branch3_counts_before); + assert_eq!(setup.branch4.counts, branch4_counts_before); + + let 
removed = setup + .main + .run_cleanup_with_referenced_branches() + .await + .unwrap(); + + assert!(!explanation.referenced_branches.is_empty()); + assert!( + explanation + .referenced_branches + .iter() + .any(|branch| branch.cleanup_candidate) + ); + assert_eq!(explanation.stats, removed); + setup.branch1.refresh().await.unwrap(); + setup.branch2.refresh().await.unwrap(); + setup.branch3.refresh().await.unwrap(); + setup.branch4.refresh().await.unwrap(); + assert_eq!(setup.main.counts.num_manifest_files, 1); + assert_eq!(setup.branch1.counts.num_manifest_files, 1); + assert_eq!(setup.branch2.counts.num_manifest_files, 1); + assert_eq!(setup.branch3.counts.num_manifest_files, 1); + assert_eq!(setup.branch4.counts.num_manifest_files, 1); + } + #[tokio::test] async fn auto_clean_referenced_branches_with_tags() { let mut setup = build_lineage_datasets().await.unwrap(); From 837dc15f9d9934a072ba268f1039ef2a18eb791b Mon Sep 17 00:00:00 2001 From: yyzhao2025 Date: Fri, 12 Jun 2026 20:34:00 +0800 Subject: [PATCH 093/177] fix(dataset): resolve Blob v2 external URIs and robustly clean failed writes in add_columns (#7152) --- python/python/tests/test_blob.py | 181 ++++ rust/lance/src/dataset/fragment.rs | 2 +- rust/lance/src/dataset/schema_evolution.rs | 986 +++++++++++++++++++-- rust/lance/src/dataset/updater.rs | 40 +- rust/lance/src/dataset/write.rs | 41 + 5 files changed, 1142 insertions(+), 108 deletions(-) diff --git a/python/python/tests/test_blob.py b/python/python/tests/test_blob.py index 5a896d21c5d..9a0ad07f637 100644 --- a/python/python/tests/test_blob.py +++ b/python/python/tests/test_blob.py @@ -45,6 +45,56 @@ def _external_blob_table(blob_path, payload=b"hello"): return pa.table({"blob": lance.blob_array([blob_path.as_uri()])}) +def _add_columns_blob_v2_values(tmp_path): + external_base = tmp_path / "external_base" + external_blob = external_base / "external_blob.bin" + external_blob.parent.mkdir(parents=True, exist_ok=True) + 
external_blob.write_bytes(b"external") + + payloads = [ + b"inline", + b"p" * (64 * 1024 + 1024), + b"d" * (4 * 1024 * 1024 + 1024), + b"external", + ] + values = [payloads[0], payloads[1], payloads[2], external_blob.as_uri()] + initial_bases = [DatasetBasePath(external_base.as_uri(), name="external", id=1)] + return values, payloads, initial_bases + + +def _assert_blob_v2_add_columns_result(dataset, column, payloads): + desc = dataset.to_table(columns=[column]).column(column).chunk(0) + + assert desc.field("kind").to_pylist() == [0, 1, 2, 3] + assert desc.field("blob_id").to_pylist()[3] == 1 + assert desc.field("blob_uri").to_pylist()[3] == "external_blob.bin" + + blobs = dataset.take_blobs(column, indices=range(len(payloads))) + assert [blob.readall() for blob in blobs] == payloads + + +def _dataset_file_set(dataset_path): + return { + path.relative_to(dataset_path) + for path in dataset_path.rglob("*") + if path.is_file() + } + + +def _write_two_fragment_blob_v2_seed_dataset(tmp_path, name): + values, payloads, initial_bases = _add_columns_blob_v2_values(tmp_path) + dataset_path = tmp_path / name + ds = lance.write_dataset( + pa.table({"id": range(8)}), + dataset_path, + data_storage_version="2.2", + initial_bases=initial_bases, + max_rows_per_file=4, + max_rows_per_group=4, + ) + return ds, dataset_path, values, payloads + + def _out_of_order_blob_selection(dataset_with_blobs, selection_kind): addresses = _blob_row_addresses(dataset_with_blobs) expected = [(addresses[4], b"quux"), (addresses[0], b"foo")] @@ -608,6 +658,137 @@ def test_blob_extension_write_external_ingest_rejects_reference_only_options(tmp ) +def test_blob_extension_add_columns_record_batch_reader_all_kinds(tmp_path): + values, payloads, initial_bases = _add_columns_blob_v2_values(tmp_path) + ds = lance.write_dataset( + pa.table({"id": range(4)}), + tmp_path / "test_add_columns_reader_blob_v2", + data_storage_version="2.2", + initial_bases=initial_bases, + ) + + ds.add_columns(pa.table({"blob": 
lance.blob_array(values)}).to_reader()) + + _assert_blob_v2_add_columns_result(ds, "blob", payloads) + + +@pytest.mark.parametrize( + "failure_mode", + [ + pytest.param("raises_after_first_fragment", id="reader_raises_mid_stream"), + pytest.param("wrong_schema", id="reader_yields_wrong_schema"), + pytest.param("too_many_rows", id="reader_produces_too_many_rows"), + ], +) +def test_blob_extension_add_columns_record_batch_reader_failure_cleans_files( + tmp_path, + failure_mode, +): + ds, dataset_path, values, payloads = _write_two_fragment_blob_v2_seed_dataset( + tmp_path, + f"test_add_columns_reader_blob_v2_fail_cleanup_{failure_mode}", + ) + external_blob_path = tmp_path / "external_base" / "external_blob.bin" + files_before = _dataset_file_set(dataset_path) + + schema = pa.schema([lance.blob_field("blob")]) + first_fragment_batch = pa.record_batch([lance.blob_array(values)], schema=schema) + second_fragment_batch = pa.record_batch([lance.blob_array(values)], schema=schema) + + if failure_mode == "raises_after_first_fragment": + match = "reader failed after first fragment" + + def failing_reader(): + yield first_fragment_batch + raise RuntimeError("reader failed after first fragment") + + elif failure_mode == "wrong_schema": + match = "field names" + + def failing_reader(): + yield first_fragment_batch + yield pa.record_batch([pa.array(range(4))], ["not_blob"]) + + else: + match = "Stream produced more values than expected for dataset" + + def failing_reader(): + yield first_fragment_batch + yield second_fragment_batch + yield pa.record_batch([lance.blob_array([payloads[0]])], schema=schema) + + with pytest.raises(OSError, match=match): + ds.add_columns(failing_reader(), reader_schema=schema) + + assert ds.version == 1 + assert _dataset_file_set(dataset_path) == files_before + assert external_blob_path.exists() + + +def test_blob_extension_add_columns_batch_udf_failure_cleans_files(tmp_path): + ds, dataset_path, values, _ = _write_two_fragment_blob_v2_seed_dataset( 
+ tmp_path, + "test_add_columns_udf_blob_v2_fail_cleanup", + ) + external_blob_path = tmp_path / "external_base" / "external_blob.bin" + files_before = _dataset_file_set(dataset_path) + call_count = 0 + + @lance.batch_udf(output_schema=pa.schema([lance.blob_field("blob")])) + def fail_on_second_fragment(batch): + nonlocal call_count + call_count += 1 + if call_count == 2: + raise RuntimeError("udf failed after first fragment") + blob_values = [values[row.as_py() % len(values)] for row in batch["id"]] + return pa.record_batch( + [lance.blob_array(blob_values)], + ["blob"], + ) + + with pytest.raises(OSError, match="udf failed after first fragment"): + ds.add_columns(fail_on_second_fragment, read_columns=["id"], batch_size=4) + + assert call_count == 2 + assert ds.version == 1 + assert _dataset_file_set(dataset_path) == files_before + assert external_blob_path.exists() + + +def test_blob_extension_add_columns_batch_udf_all_kinds(tmp_path): + values, payloads, initial_bases = _add_columns_blob_v2_values(tmp_path) + ds = lance.write_dataset( + pa.table({"id": range(4)}), + tmp_path / "test_add_columns_udf_blob_v2", + data_storage_version="2.2", + initial_bases=initial_bases, + ) + + @lance.batch_udf(output_schema=pa.schema([lance.blob_field("blob")])) + def make_blob_column(batch): + return pa.record_batch( + [lance.blob_array([values[row.as_py()] for row in batch["id"]])], + ["blob"], + ) + + ds.add_columns(make_blob_column, read_columns=["id"]) + + _assert_blob_v2_add_columns_result(ds, "blob", payloads) + + +def test_blob_extension_add_columns_all_nulls_blob_v2(tmp_path): + ds = lance.write_dataset( + pa.table({"id": range(4)}), + tmp_path / "test_add_columns_all_nulls_blob_v2", + data_storage_version="2.2", + ) + + ds.add_columns(lance.blob_field("blob")) + + assert ds.to_table(columns=["blob"]).column("blob").to_pylist() == [None] * 4 + assert ds.take_blobs("blob", indices=range(4)) == [] + + def 
test_blob_extension_write_fragments_external_denied_by_default(tmp_path): blob_path = tmp_path / "external_blob.bin" diff --git a/rust/lance/src/dataset/fragment.rs b/rust/lance/src/dataset/fragment.rs index 11851e8846e..eb165e5f612 100644 --- a/rust/lance/src/dataset/fragment.rs +++ b/rust/lance/src/dataset/fragment.rs @@ -1792,7 +1792,7 @@ impl FileFragment { read_columns: Option>, batch_size: Option, ) -> Result<(Fragment, Schema)> { - let (fragments, schema) = schema_evolution::add_columns_to_fragments( + let (fragments, schema, _) = schema_evolution::add_columns_to_fragments( self.dataset.as_ref(), transforms, read_columns, diff --git a/rust/lance/src/dataset/schema_evolution.rs b/rust/lance/src/dataset/schema_evolution.rs index f5d792979df..a2c8f05a89f 100644 --- a/rust/lance/src/dataset/schema_evolution.rs +++ b/rust/lance/src/dataset/schema_evolution.rs @@ -1,12 +1,16 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use std::{collections::HashSet, sync::Arc}; +use std::{ + collections::{HashMap, HashSet}, + sync::Arc, +}; use super::fragment::FileFragment; use super::{ Dataset, transaction::{Operation, Transaction}, + write::cleanup_data_fragments, }; use crate::{Error, Result, io::exec::Planner}; use arrow::compute::CastOptions; @@ -239,7 +243,7 @@ pub(super) async fn add_columns_to_fragments( read_columns: Option>, fragments: &[FileFragment], batch_size: Option, -) -> Result<(Vec, Schema)> { +) -> Result<(Vec, Schema, Vec)> { // Check names early (before calling add_columns_impl) to avoid extra work if // the names are wrong. 
let version = dataset.manifest.data_storage_format.lance_file_version()?; @@ -261,10 +265,10 @@ pub(super) async fn add_columns_to_fragments( } let transforms = optimizer.optimize(dataset, transforms)?; - let (output_schema, fragments) = match transforms { + let (output_schema, new_fragments, fragments_to_cleanup) = match transforms { NewColumnTransform::BatchUDF(udf) => { check_names(udf.output_schema.as_ref())?; - let fragments = add_columns_impl( + let result = add_columns_impl( fragments, read_columns, udf.mapper, @@ -273,7 +277,11 @@ pub(super) async fn add_columns_to_fragments( None, ) .await?; - Result::Ok((udf.output_schema, fragments)) + Result::Ok(( + udf.output_schema, + result.fragments, + result.fragments_to_cleanup, + )) } NewColumnTransform::SqlExpressions(expressions) => { // We just transform the SQL expression into a UDF backed by DataFusion @@ -336,22 +344,22 @@ pub(super) async fn add_columns_to_fragments( let mapper = Box::new(mapper); let read_columns = Some(read_schema.field_names().into_iter().cloned().collect()); - let fragments = + let result = add_columns_impl(fragments, read_columns, mapper, batch_size, None, None).await?; - Ok((output_schema, fragments)) + Ok((output_schema, result.fragments, result.fragments_to_cleanup)) } NewColumnTransform::Stream(stream) => { let output_schema = stream.schema(); check_names(output_schema.as_ref())?; let fragments = add_columns_from_stream(fragments, stream, None, batch_size).await?; - Ok((output_schema, fragments)) + Ok((output_schema, fragments.clone(), fragments)) } NewColumnTransform::Reader(reader) => { let output_schema = reader.schema(); check_names(output_schema.as_ref())?; let stream = reader.into_stream(); let fragments = add_columns_from_stream(fragments, stream, None, batch_size).await?; - Ok((output_schema, fragments)) + Ok((output_schema, fragments.clone(), fragments)) } NewColumnTransform::AllNulls(output_schema) => { check_names(output_schema.as_ref())?; @@ -379,14 +387,20 @@ 
pub(super) async fn add_columns_to_fragments( )); } - Ok((output_schema, fragments)) + Ok((output_schema, fragments, Vec::new())) } }?; - let mut schema = dataset.schema().merge(output_schema.as_ref())?; + let mut schema = match dataset.schema().merge(output_schema.as_ref()) { + Ok(schema) => schema, + Err(e) => { + cleanup_new_column_data_files(fragments, &fragments_to_cleanup).await; + return Err(e); + } + }; schema.set_field_id(Some(dataset.manifest.max_field_id())); - Ok((fragments, schema)) + Ok((new_fragments, schema, fragments_to_cleanup)) } pub(super) async fn add_columns( @@ -395,7 +409,7 @@ pub(super) async fn add_columns( read_columns: Option>, batch_size: Option, ) -> Result<()> { - let (fragments, schema) = add_columns_to_fragments( + let (fragments, schema, fragments_to_cleanup) = add_columns_to_fragments( dataset, transforms, read_columns, @@ -406,11 +420,75 @@ pub(super) async fn add_columns( let operation = Operation::Merge { fragments, schema }; let transaction = Transaction::new(dataset.manifest.version, operation, None); - dataset + match dataset .apply_commit(transaction, &Default::default(), &Default::default()) - .await?; + .await + { + Ok(()) => Ok(()), + Err(e) => { + cleanup_new_column_data_files(&dataset.get_fragments(), &fragments_to_cleanup).await; + Err(e) + } + } +} - Ok(()) +async fn cleanup_new_column_data_files(fragments: &[FileFragment], new_fragments: &[Fragment]) { + let Some(first_fragment) = fragments.first() else { + return; + }; + + // add_columns rewrites fragment metadata in place, so cleanup must delete + // only files created by the current attempt and must not touch pre-existing + // files that still belong to the fragment. 
+ let original_files_by_fragment = fragments + .iter() + .map(|fragment| { + let files = fragment + .metadata + .files + .iter() + .map(|file| (file.base_id, file.path.clone())) + .collect::>(); + (fragment.id() as u64, files) + }) + .collect::>(); + + let fragments_to_cleanup = new_fragments + .iter() + .filter_map(|fragment| { + let original_files = original_files_by_fragment.get(&fragment.id)?; + let files = fragment + .files + .iter() + .filter(|file| !original_files.contains(&(file.base_id, file.path.clone()))) + .cloned() + .collect::>(); + + if files.is_empty() { + None + } else { + let mut fragment = fragment.clone(); + fragment.files = files; + Some(fragment) + } + }) + .collect::>(); + + cleanup_data_fragments( + &first_fragment.dataset().object_store, + &first_fragment.dataset().base, + &fragments_to_cleanup, + ) + .await; +} + +struct AddColumnFragments { + /// Fragments produced by the add-columns operation and returned to the + /// caller for the final merge commit. + fragments: Vec, + /// Uncommitted fragments whose newly written data files must be removed if + /// the operation fails before the merge commit completes. 
+ fragments_to_cleanup: Vec, } #[allow(clippy::type_complexity)] @@ -421,63 +499,96 @@ async fn add_columns_impl( batch_size: Option, result_cache: Option>, schemas: Option<(Schema, Schema)>, -) -> Result> { +) -> Result { let read_columns_ref = read_columns.as_deref(); let mapper_ref = mapper.as_ref(); - let fragments = futures::stream::iter(fragments) - .then(|fragment| { - let cache_ref = result_cache.clone(); - let schemas_ref = &schemas; - async move { - if let Some(cache) = &cache_ref { - let fragment_id = fragment.id() as u32; - let fragment = cache.get_fragment(fragment_id)?; - if let Some(fragment) = fragment { - return Ok(fragment); - } + + let mut new_fragments = Vec::with_capacity(fragments.len()); + let mut fragments_to_cleanup = Vec::with_capacity(fragments.len()); + + for fragment in fragments { + if let Some(cache) = &result_cache { + let fragment_id = fragment.id() as u32; + let fragment = match cache.get_fragment(fragment_id) { + Ok(fragment) => fragment, + Err(e) => { + cleanup_new_column_data_files(fragments, &fragments_to_cleanup).await; + return Err(e); } + }; + if let Some(fragment) = fragment { + new_fragments.push(fragment); + continue; + } + } - let mut updater = fragment - .updater(read_columns_ref, schemas_ref.clone(), batch_size) - .await?; - - let mut batch_index = 0; - // TODO: the structure of the updater prevents batch-level parallelism here, - // but there is no reason why we couldn't do this in parallel. - while let Some(batch) = updater.next().await? 
{ - let batch_info = BatchInfo { - fragment_id: fragment.id() as u32, - batch_index, - }; + let mut updater = match fragment + .updater(read_columns_ref, schemas.clone(), batch_size) + .await + { + Ok(updater) => updater, + Err(e) => { + cleanup_new_column_data_files(fragments, &fragments_to_cleanup).await; + return Err(e); + } + }; + let fragment_result = async { + let mut batch_index = 0; + // TODO: the structure of the updater prevents batch-level parallelism here, + // but there is no reason why we couldn't do this in parallel. + while let Some(batch) = updater.next().await? { + let batch_info = BatchInfo { + fragment_id: fragment.id() as u32, + batch_index, + }; - let new_batch = if let Some(cache) = &cache_ref { - if let Some(batch) = cache.get_batch(&batch_info)? { - batch - } else { - let new_batch = mapper_ref(batch)?; - cache.insert_batch(batch_info, new_batch.clone())?; - new_batch - } + let new_batch = if let Some(cache) = &result_cache { + if let Some(batch) = cache.get_batch(&batch_info)? { + batch } else { - mapper_ref(batch)? - }; + let new_batch = mapper_ref(batch)?; + cache.insert_batch(batch_info, new_batch.clone())?; + new_batch + } + } else { + mapper_ref(batch)? + }; - updater.update(new_batch).await?; - batch_index += 1; - } + updater.update(new_batch).await?; + batch_index += 1; + } - let fragment = updater.finish().await?; + let new_fragment = updater.finish().await?; + fragments_to_cleanup.push(new_fragment.clone()); - if let Some(cache) = &cache_ref { - cache.insert_fragment(fragment.clone())?; - } + if let Some(cache) = &result_cache { + // Once the checkpoint store owns this fragment, retries may load + // it back instead of rewriting it. Removing it from the cleanup + // set avoids deleting data that has already been checkpointed. 
+ cache.insert_fragment(new_fragment.clone())?; + fragments_to_cleanup.pop(); + } - Ok::<_, Error>(fragment) + Ok::<_, Error>(new_fragment) + } + .await; + + match fragment_result { + Ok(new_fragment) => { + new_fragments.push(new_fragment); } - }) - .try_collect::>() - .await?; - Ok(fragments) + Err(e) => { + updater.cleanup_unfinished_writer().await; + cleanup_new_column_data_files(fragments, &fragments_to_cleanup).await; + return Err(e); + } + } + } + + Ok(AddColumnFragments { + fragments: new_fragments, + fragments_to_cleanup, + }) } async fn add_columns_from_stream( @@ -489,49 +600,69 @@ async fn add_columns_from_stream( let mut new_fragments = Vec::with_capacity(fragments.len()); let mut last_seen_batch: Option = None; for fragment in fragments { - let mut updater = fragment + let mut updater = match fragment .updater::(Some(&[]), schemas.clone(), batch_size) - .await?; - while let Some(batch) = updater.next().await? { - debug_assert_eq!(batch.num_columns(), 1); - let mut rows_remaining = batch.num_rows(); + .await + { + Ok(updater) => updater, + Err(e) => { + cleanup_new_column_data_files(fragments, &new_fragments).await; + return Err(e); + } + }; + let result: Result = async { + while let Some(batch) = updater.next().await? { + debug_assert_eq!(batch.num_columns(), 1); + let mut rows_remaining = batch.num_rows(); - let mut batches = Vec::new(); + let mut batches = Vec::new(); - while rows_remaining > 0 { - let next_batch = if let Some(last_seen_batch) = last_seen_batch { - last_seen_batch - } else { - stream.next().await.ok_or_else(|| { - Error::invalid_input( - "Stream ended before producing values for all rows in dataset", - ) - })?? 
- }; - let num_rows = next_batch.num_rows(); - if num_rows > rows_remaining { - let new_batch = next_batch.slice(0, rows_remaining); - batches.push(new_batch); - last_seen_batch = - Some(next_batch.slice(rows_remaining, num_rows - rows_remaining)); - rows_remaining = 0; - } else { - batches.push(next_batch); - rows_remaining -= num_rows; - last_seen_batch = None; + while rows_remaining > 0 { + let next_batch = if let Some(last_seen) = last_seen_batch.take() { + last_seen + } else { + stream.next().await.ok_or_else(|| { + Error::invalid_input( + "Stream ended before producing values for all rows in dataset", + ) + })?? + }; + let num_rows = next_batch.num_rows(); + if num_rows > rows_remaining { + let new_batch = next_batch.slice(0, rows_remaining); + batches.push(new_batch); + last_seen_batch = + Some(next_batch.slice(rows_remaining, num_rows - rows_remaining)); + rows_remaining = 0; + } else { + batches.push(next_batch); + rows_remaining -= num_rows; + last_seen_batch = None; + } } - } - let new_batch = - arrow_select::concat::concat_batches(&batches[0].schema(), batches.iter())?; + let new_batch = + arrow_select::concat::concat_batches(&batches[0].schema(), batches.iter())?; - updater.update(new_batch).await?; + updater.update(new_batch).await?; + } + updater.finish().await + } + .await; + + match result { + Ok(new_fragment) => new_fragments.push(new_fragment), + Err(e) => { + updater.cleanup_unfinished_writer().await; + cleanup_new_column_data_files(fragments, &new_fragments).await; + return Err(e); + } } - new_fragments.push(updater.finish().await?); } // Ensure the stream is fully consumed if last_seen_batch.is_some() || stream.next().await.is_some() { + cleanup_new_column_data_files(fragments, &new_fragments).await; return Err(Error::invalid_input_source( "Stream produced more values than expected for dataset".into(), )); @@ -653,7 +784,7 @@ pub(super) async fn alter_columns( }; let mapper = Box::new(mapper); - let fragments = add_columns_impl( + let result = 
add_columns_impl( &dataset.get_fragments(), Some(read_columns), mapper, @@ -666,7 +797,8 @@ pub(super) async fn alter_columns( // Some data files may no longer contain any columns in the dataset (e.g. if every // remaining column has been altered into a different data file) and so we remove them let schema_field_ids = new_schema.field_ids().into_iter().collect::>(); - let fragments = fragments + let fragments = result + .fragments .into_iter() .map(|mut frag| { frag.files.retain(|f| { @@ -762,8 +894,7 @@ pub fn exclude(source: &Schema, other: &Schema, version: &LanceFileVersion) -> R #[cfg(test)] mod test { - use std::collections::HashMap; - use std::sync::Mutex; + use std::{collections::HashMap, fs, num::NonZero, path::Path as StdPath, sync::Mutex}; use crate::dataset::WriteParams; use arrow_array::{ @@ -774,6 +905,7 @@ mod test { use arrow_schema::Fields as ArrowFields; use lance_core::utils::tempfile::TempStrDir; use lance_file::version::LanceFileVersion; + use lance_table::format::{BasePath, DataFile}; use rstest::rstest; // Used to validate that futures returned are Send. @@ -781,6 +913,47 @@ mod test { t } + fn file_paths_in(dir: impl AsRef) -> Vec { + fn collect_files( + base_dir: &StdPath, + dir: &StdPath, + files: &mut Vec, + ) -> std::io::Result<()> { + if !dir.exists() { + return Ok(()); + } + for entry in std::fs::read_dir(dir)? 
{ + let path = entry?.path(); + if path.is_dir() { + collect_files(base_dir, &path, files)?; + } else if path.is_file() + && path + .file_name() + .and_then(|name| name.to_str()) + .is_some_and(|file_name| !file_name.starts_with('.')) + { + files.push( + path.strip_prefix(base_dir) + .unwrap() + .to_string_lossy() + .to_string(), + ); + } + } + Ok(()) + } + + let base_dir = dir.as_ref(); + let mut files = Vec::new(); + collect_files(base_dir, base_dir, &mut files).unwrap(); + files.sort(); + files + } + + fn data_file_paths_in(base_dir: &str) -> Vec { + file_paths_in(StdPath::new(base_dir).join("data")) + } + #[tokio::test] async fn test_append_columns_exprs() -> Result<()> { let num_rows = 5; @@ -864,6 +1037,623 @@ mod test { Ok(()) } + #[rstest] + #[tokio::test] + async fn test_add_columns_cleans_up_blob_v2_data_on_stream_error( + #[values( + ("inline", b"inline".to_vec()), + ("packed", vec![1u8; 128 * 1024]), + ("dedicated", vec![2u8; 5 * 1024 * 1024]), + ("external", b"external".to_vec()) + )] + blob_case: (&str, Vec), + ) -> Result<()> { + let (blob_kind, payload) = blob_case; + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..1))], + )?; + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + let external_dir = tempfile::tempdir()?; + let external_path = external_dir.path().join("blob.bin"); + fs::write(&external_path, &payload)?; + let external_baseline_files = file_paths_in(external_dir.path()); + let external_baseline_payload = fs::read(&external_path)?; + + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_2), + initial_bases: Some(vec![BasePath::new( + 1, + external_dir.path().to_string_lossy().to_string(), + 
Some("external".to_string()), + false, + )]), + ..Default::default() + }), + ) + .await?; + let baseline_files = data_file_paths_in(test_uri); + + let mut blob_builder = crate::BlobArrayBuilder::new(2); + if blob_kind == "external" { + blob_builder.push_uri(external_path.to_string_lossy())?; + } else { + blob_builder.push_bytes(payload)?; + } + blob_builder.push_bytes(b"extra")?; + let blob_array = blob_builder.finish()?; + let blob_schema = Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])); + let blob_batch = RecordBatch::try_new(blob_schema.clone(), vec![blob_array])?; + let reader = RecordBatchIterator::new(vec![Ok(blob_batch)], blob_schema); + + let err = dataset + .add_columns(NewColumnTransform::Reader(Box::new(reader)), None, None) + .await + .unwrap_err(); + assert!( + err.to_string() + .contains("Stream produced more values than expected for dataset") + ); + + assert_eq!( + data_file_paths_in(test_uri), + baseline_files, + "add_columns should clean up new data files and blob v2 sidecars on failure" + ); + assert_eq!( + file_paths_in(external_dir.path()), + external_baseline_files, + "cleanup must not delete external files" + ); + assert_eq!( + fs::read(&external_path)?, + external_baseline_payload, + "cleanup must not modify external files" + ); + dataset.validate().await?; + + Ok(()) + } + + #[tokio::test] + async fn test_cleanup_preserves_checkpointed_fragment_files() -> Result<()> { + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..2))], + )?; + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + max_rows_per_file: 1, + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await?; 
+ let original_fragments = dataset.get_fragments(); + assert_eq!(original_fragments.len(), 2); + + let data_dir = StdPath::new(test_uri).join("data"); + let cached_file = data_dir.join("checkpointed.lance"); + let cached_blob_dir = data_dir.join("checkpointed"); + fs::write(&cached_file, b"checkpointed data")?; + fs::create_dir_all(&cached_blob_dir)?; + fs::write( + cached_blob_dir.join("00000000000000000000000000000001.blob"), + b"blob", + )?; + + let mut checkpointed_fragment = original_fragments[0].metadata().clone(); + checkpointed_fragment.files.push(DataFile::new( + "checkpointed.lance", + vec![dataset.manifest.max_field_id() + 1], + vec![0], + 2, + 2, + NonZero::new(17), + None, + )); + + #[derive(Default)] + struct CheckpointedFragmentStore { + fragment: Mutex>, + } + + impl UDFCheckpointStore for CheckpointedFragmentStore { + fn get_batch(&self, _info: &BatchInfo) -> Result> { + Ok(None) + } + + fn insert_batch(&self, _info: BatchInfo, _batch: RecordBatch) -> Result<()> { + Ok(()) + } + + fn get_fragment(&self, fragment_id: u32) -> Result> { + if fragment_id == 0 { + Ok(self.fragment.lock().unwrap().clone()) + } else { + Ok(None) + } + } + + fn insert_fragment(&self, _fragment: Fragment) -> Result<()> { + Ok(()) + } + } + + let transforms = NewColumnTransform::BatchUDF(BatchUDF { + mapper: Box::new(|_| Err(Error::invalid_input("injected UDF failure"))), + output_schema: Arc::new(ArrowSchema::new(vec![ArrowField::new( + "checkpointed", + DataType::Int32, + true, + )])), + result_checkpoint: Some(Arc::new(CheckpointedFragmentStore { + fragment: Mutex::new(Some(checkpointed_fragment)), + })), + }); + + let err = dataset + .add_columns(transforms, None, None) + .await + .unwrap_err(); + assert!(err.to_string().contains("injected UDF failure")); + + assert!( + cached_file.exists(), + "cleanup must not delete fragment files restored from a checkpoint" + ); + assert!( + cached_blob_dir.exists(), + "cleanup must not delete blob sidecars restored from a checkpoint" 
+ ); + + Ok(()) + } + + #[tokio::test] + async fn test_add_columns_cleans_current_blob_v2_writer_on_udf_error() -> Result<()> { + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..2))], + )?; + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await?; + let baseline_files = data_file_paths_in(test_uri); + + let call_count = Arc::new(Mutex::new(0usize)); + let mapper_call_count = call_count.clone(); + let output_schema = Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])); + let mapper = move |batch: &RecordBatch| { + let mut call_count = mapper_call_count.lock().unwrap(); + *call_count += 1; + if *call_count == 2 { + return Err(Error::invalid_input("injected UDF failure")); + } + + let mut blob_builder = crate::BlobArrayBuilder::new(batch.num_rows()); + for _ in 0..batch.num_rows() { + blob_builder.push_bytes(vec![7u8; 5 * 1024 * 1024])?; + } + Ok(RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])), + vec![blob_builder.finish()?], + )?) 
+ }; + let transforms = NewColumnTransform::BatchUDF(BatchUDF { + mapper: Box::new(mapper), + output_schema, + result_checkpoint: None, + }); + + let err = dataset + .add_columns(transforms, None, Some(1)) + .await + .unwrap_err(); + assert!(err.to_string().contains("injected UDF failure")); + assert_eq!( + data_file_paths_in(test_uri), + baseline_files, + "add_columns should clean files written by the current unfinished writer" + ); + + Ok(()) + } + + #[tokio::test] + async fn test_add_columns_preserves_checkpointed_blob_v2_fragment_on_checkpoint_lookup_error() + -> Result<()> { + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..2))], + )?; + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + max_rows_per_file: 1, + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await?; + + struct FailingLookupStore { + inserted: Arc>>, + } + + impl UDFCheckpointStore for FailingLookupStore { + fn get_batch(&self, _info: &BatchInfo) -> Result> { + Ok(None) + } + + fn insert_batch(&self, _info: BatchInfo, _batch: RecordBatch) -> Result<()> { + Ok(()) + } + + fn get_fragment(&self, fragment_id: u32) -> Result> { + if fragment_id == 1 { + Err(Error::invalid_input("injected checkpoint lookup failure")) + } else { + Ok(None) + } + } + + fn insert_fragment(&self, fragment: Fragment) -> Result<()> { + *self.inserted.lock().unwrap() = Some(fragment); + Ok(()) + } + } + + let inserted = Arc::new(Mutex::new(None)); + let output_schema = Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])); + let mapper = move |batch: &RecordBatch| { + let mut blob_builder = crate::BlobArrayBuilder::new(batch.num_rows()); + for _ 
in 0..batch.num_rows() { + blob_builder.push_bytes(vec![7u8; 5 * 1024 * 1024])?; + } + Ok(RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])), + vec![blob_builder.finish()?], + )?) + }; + let transforms = NewColumnTransform::BatchUDF(BatchUDF { + mapper: Box::new(mapper), + output_schema, + result_checkpoint: Some(Arc::new(FailingLookupStore { + inserted: inserted.clone(), + })), + }); + + let err = dataset + .add_columns(transforms, None, None) + .await + .unwrap_err(); + assert!( + err.to_string() + .contains("injected checkpoint lookup failure") + ); + let inserted = inserted.lock().unwrap().clone().unwrap(); + let new_file = inserted + .files + .iter() + .find(|file| { + file.fields + .iter() + .any(|field| *field > dataset.manifest.max_field_id()) + }) + .expect("checkpoint should record the newly written data file"); + let new_file_path = StdPath::new(test_uri).join("data").join(&new_file.path); + let new_blob_dir = StdPath::new(test_uri) + .join("data") + .join(StdPath::new(&new_file.path).file_stem().unwrap()); + assert!( + new_file_path.exists(), + "cleanup must not delete data files after checkpoint takes ownership" + ); + assert!( + new_blob_dir.exists(), + "cleanup must not delete blob sidecars after checkpoint takes ownership" + ); + + Ok(()) + } + + #[tokio::test] + async fn test_add_columns_cleans_finished_blob_v2_writer_on_checkpoint_insert_error() + -> Result<()> { + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..1))], + )?; + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await?; + let 
baseline_files = data_file_paths_in(test_uri); + + struct FailingInsertStore; + + impl UDFCheckpointStore for FailingInsertStore { + fn get_batch(&self, _info: &BatchInfo) -> Result> { + Ok(None) + } + + fn insert_batch(&self, _info: BatchInfo, _batch: RecordBatch) -> Result<()> { + Ok(()) + } + + fn get_fragment(&self, _fragment_id: u32) -> Result> { + Ok(None) + } + + fn insert_fragment(&self, _fragment: Fragment) -> Result<()> { + Err(Error::invalid_input("injected checkpoint insert failure")) + } + } + + let output_schema = Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])); + let mapper = move |batch: &RecordBatch| { + let mut blob_builder = crate::BlobArrayBuilder::new(batch.num_rows()); + for _ in 0..batch.num_rows() { + blob_builder.push_bytes(vec![7u8; 5 * 1024 * 1024])?; + } + Ok(RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])), + vec![blob_builder.finish()?], + )?) + }; + let transforms = NewColumnTransform::BatchUDF(BatchUDF { + mapper: Box::new(mapper), + output_schema, + result_checkpoint: Some(Arc::new(FailingInsertStore)), + }); + + let err = dataset + .add_columns(transforms, None, None) + .await + .unwrap_err(); + assert!( + err.to_string() + .contains("injected checkpoint insert failure") + ); + assert_eq!( + data_file_paths_in(test_uri), + baseline_files, + "add_columns should clean finished writer files when checkpoint insert fails" + ); + + Ok(()) + } + + #[tokio::test] + async fn test_add_columns_cleans_blob_v2_files_on_declared_schema_merge_error() -> Result<()> { + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..1))], + )?; + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + let mut dataset = Dataset::write( + reader, + test_uri, + 
Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await?; + let baseline_files = data_file_paths_in(test_uri); + + let mapper = move |batch: &RecordBatch| { + let mut blob_builder = crate::BlobArrayBuilder::new(batch.num_rows()); + for _ in 0..batch.num_rows() { + blob_builder.push_bytes(vec![7u8; 5 * 1024 * 1024])?; + } + Ok(RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])), + vec![blob_builder.finish()?], + )?) + }; + let transforms = NewColumnTransform::BatchUDF(BatchUDF { + mapper: Box::new(mapper), + output_schema: Arc::new(ArrowSchema::new(vec![ + ArrowField::new("declared", DataType::Int32, true), + ArrowField::new("declared", DataType::Int32, true), + ])), + result_checkpoint: None, + }); + + let err = dataset + .add_columns(transforms, None, None) + .await + .unwrap_err(); + assert!(matches!(err, Error::Schema { .. })); + assert_eq!( + data_file_paths_in(test_uri), + baseline_files, + "add_columns should clean files written before declared schema merge fails" + ); + + Ok(()) + } + + #[tokio::test] + async fn test_add_columns_preserves_checkpointed_blob_v2_fragment_after_later_failure() + -> Result<()> { + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..2))], + )?; + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + max_rows_per_file: 1, + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await?; + + struct InsertThenFailStore { + inserted: Arc>>, + } + + impl UDFCheckpointStore for InsertThenFailStore { + fn get_batch(&self, info: &BatchInfo) -> Result> { + if info.fragment_id == 1 { + 
Err(Error::invalid_input("injected later checkpoint failure")) + } else { + Ok(None) + } + } + + fn insert_batch(&self, _info: BatchInfo, _batch: RecordBatch) -> Result<()> { + Ok(()) + } + + fn get_fragment(&self, _fragment_id: u32) -> Result> { + Ok(None) + } + + fn insert_fragment(&self, fragment: Fragment) -> Result<()> { + *self.inserted.lock().unwrap() = Some(fragment); + Ok(()) + } + } + + let inserted = Arc::new(Mutex::new(None)); + let output_schema = Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])); + let mapper = move |batch: &RecordBatch| { + let mut blob_builder = crate::BlobArrayBuilder::new(batch.num_rows()); + for _ in 0..batch.num_rows() { + blob_builder.push_bytes(vec![7u8; 5 * 1024 * 1024])?; + } + Ok(RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![crate::blob_field("blob", true)])), + vec![blob_builder.finish()?], + )?) + }; + let transforms = NewColumnTransform::BatchUDF(BatchUDF { + mapper: Box::new(mapper), + output_schema, + result_checkpoint: Some(Arc::new(InsertThenFailStore { + inserted: inserted.clone(), + })), + }); + + let err = dataset + .add_columns(transforms, None, None) + .await + .unwrap_err(); + assert!( + err.to_string() + .contains("injected later checkpoint failure") + ); + + let inserted = inserted.lock().unwrap().clone().unwrap(); + let new_file = inserted + .files + .iter() + .find(|file| { + file.fields + .iter() + .any(|field| *field > dataset.manifest.max_field_id()) + }) + .expect("checkpoint should record the newly written data file"); + let new_file_path = StdPath::new(test_uri).join("data").join(&new_file.path); + let new_blob_dir = StdPath::new(test_uri) + .join("data") + .join(StdPath::new(&new_file.path).file_stem().unwrap()); + assert!( + new_file_path.exists(), + "cleanup must not delete data files after checkpoint takes ownership" + ); + assert!( + new_blob_dir.exists(), + "cleanup must not delete blob sidecars after checkpoint takes ownership" + ); + + Ok(()) + } + #[rstest] 
#[tokio::test] async fn test_append_columns_udf( diff --git a/rust/lance/src/dataset/updater.rs b/rust/lance/src/dataset/updater.rs index b9bc34f8706..90ef8df914b 100644 --- a/rust/lance/src/dataset/updater.rs +++ b/rust/lance/src/dataset/updater.rs @@ -6,13 +6,13 @@ use futures::StreamExt; use lance_core::datatypes::{OnMissing, OnTypeMismatch}; use lance_core::utils::deletion::DeletionVector; use lance_core::{Error, Result, datatypes::Schema}; -use lance_table::format::Fragment; +use lance_table::format::{DataFile, Fragment}; use lance_table::utils::stream::ReadBatchFutStream; use super::Dataset; use super::fragment::FragmentReader; use super::scanner::get_default_batch_size; -use super::write::{GenericWriter, open_writer}; +use super::write::{GenericWriter, cleanup_data_fragments, open_update_writer}; use crate::dataset::FileFragment; use crate::dataset::utils::SchemaAdapter; @@ -146,13 +146,7 @@ impl Updater { .data_storage_format .lance_file_version()?; - open_writer( - &self.fragment.dataset().object_store, - &schema, - &self.fragment.dataset().base, - data_storage_version, - ) - .await + open_update_writer(self.dataset(), &schema, data_storage_version).await } /// Update one batch. @@ -221,6 +215,34 @@ impl Updater { Ok(self.fragment.metadata().clone()) } + /// Clean up any data file and blob sidecars created by the current unfinished writer. + pub(super) async fn cleanup_unfinished_writer(&mut self) { + let Some(writer) = self.writer.take() else { + return; + }; + let (path, base_id) = writer.data_file_path(); + let path = path.to_string(); + drop(writer); + + if path.is_empty() { + return; + } + + let mut fragment = Fragment::new(self.fragment.id() as u64); + // cleanup_data_fragments only needs path/base_id to remove the unfinished + // data file and any blob sidecars. Build a minimal synthetic fragment so + // we can reuse the shared cleanup path without fabricating full metadata. 
+ fragment + .files + .push(DataFile::new(path, vec![], vec![], 0, 0, None, base_id)); + cleanup_data_fragments( + &self.dataset().object_store, + &self.dataset().base, + &[fragment], + ) + .await; + } + /// Get the final schema of the fragment after the update. /// /// This may be None if the schema is not known. This can happen if it was diff --git a/rust/lance/src/dataset/write.rs b/rust/lance/src/dataset/write.rs index 1e73618fc6b..b42b1f1cba9 100644 --- a/rust/lance/src/dataset/write.rs +++ b/rust/lance/src/dataset/write.rs @@ -1021,6 +1021,8 @@ pub async fn write_fragments_internal( pub trait GenericWriter: Send { /// Write the given batches to the file async fn write(&mut self, batches: &[RecordBatch]) -> Result<()>; + /// Get the file path and base ID for the data file being written. + fn data_file_path(&self) -> (&str, Option); /// Get the current position in the file /// /// We use this to know when the file is too large and we need to start @@ -1047,6 +1049,9 @@ where async fn write(&mut self, batches: &[RecordBatch]) -> Result<()> { self.writer.write(batches).await } + fn data_file_path(&self) -> (&str, Option) { + (&self.path, self.base_id) + } async fn tell(&mut self) -> Result { Ok(self.writer.tell().await? as u64) } @@ -1087,6 +1092,9 @@ impl GenericWriter for V2WriterAdapter { } Ok(()) } + fn data_file_path(&self) -> (&str, Option) { + (&self.path, self.base_id) + } async fn tell(&mut self) -> Result { Ok(self.writer.tell().await?) } @@ -1140,6 +1148,39 @@ pub async fn open_writer( .await } +pub(super) async fn open_update_writer( + dataset: &Dataset, + schema: &Schema, + storage_version: LanceFileVersion, +) -> Result> { + // add_columns / alter_columns reuse the normal writer stack, but they do not + // flow through WriteParams. Rebuild the external base resolver here so blob + // v2 reference columns can resolve dataset-registered external URIs. 
+ let external_base_resolver = if storage_version >= LanceFileVersion::V2_2 + && schema.fields.iter().any(|f| f.is_blob_v2()) + { + Some(Arc::new( + build_external_base_resolver(Some(dataset), &WriteParams::default()).await?, + )) + } else { + None + }; + + open_writer_with_options( + &dataset.object_store, + schema, + &dataset.base, + storage_version, + WriterOptions { + add_data_dir: true, + external_base_resolver, + source_store_registry: dataset.session.store_registry(), + ..Default::default() + }, + ) + .await +} + #[derive(Default)] struct WriterOptions { add_data_dir: bool, From e380167795f19a5ed4078868b59d4991b2c7f45a Mon Sep 17 00:00:00 2001 From: Yang Cen Date: Fri, 12 Jun 2026 23:34:20 +0800 Subject: [PATCH 094/177] fix(python): expose stable row id property in stub (#7249) ## Feature ### What is the new feature? This PR makes the existing public Python helper, `LanceDataset.has_stable_row_ids`, fully usable and documented for downstream callers. It does not add a second synonymous public API. The single supported Python entry point remains the existing `has_stable_row_ids` property. ### Why do we need this feature? Downstream users need a stable public API for detecting stable row ID support without parsing serialized manifest bytes or hard-coding feature flag bits. This is especially useful for empty datasets, where there may be no fragments to inspect but the manifest feature flag still records whether stable row IDs are enabled. ### How does it work? - Keeps `LanceDataset.has_stable_row_ids` as the single Python helper. - Documents that the property is based on the dataset manifest feature flag, not fragments. - Adds the missing `_Dataset.has_stable_row_ids` entry to the PyO3 type stub. - Expands dataset tests to cover stable and non-stable datasets for both empty and non-empty tables. Geneva should replace serialized manifest byte parsing with `dataset.has_stable_row_ids` once it depends on a Lance version containing this PR. 
## Validation Passed: - `make install` from `python/` - `uv run pytest python/tests/test_dataset.py::test_has_stable_row_ids_property` - `uv run ruff check python/lance/dataset.py python/lance/lance/__init__.pyi python/tests/test_dataset.py` - `cargo fmt --manifest-path python/Cargo.toml --all --check` - `cargo fmt --manifest-path python/Cargo.toml --all` - `cargo clippy --manifest-path python/Cargo.toml --all-targets -- -D warnings` - `git diff --check` - pre-commit hooks during commit/amend: `ruff`, `ruff-format`, `fmt`, `typos` Known validation caveat: - `uv run make lint` passed ruff and ruff-format, then failed in `pyright` on existing environment/type-resolution issues outside this change: unresolved optional `tensorflow` and `torch` imports in `python/lance/dependencies.py`, plus Python 3.14 stdlib parsing issues from pyright 1.1.406. --- python/python/lance/dataset.py | 5 ++++- python/python/lance/lance/__init__.pyi | 2 ++ python/python/tests/test_dataset.py | 27 ++++++++++++++++++-------- 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index dae72b88b1c..45dc1b253d3 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -1353,7 +1353,10 @@ def data_storage_version(self) -> str: @property def has_stable_row_ids(self) -> bool: """ - Whether this dataset has stable row IDs enabled + Whether this dataset has stable row IDs enabled. + + This is based on the dataset manifest feature flag and does not depend on + whether the current version has any fragments. """ return self._ds.has_stable_row_ids diff --git a/python/python/lance/lance/__init__.pyi b/python/python/lance/lance/__init__.pyi index 38d82738063..74db076db41 100644 --- a/python/python/lance/lance/__init__.pyi +++ b/python/python/lance/lance/__init__.pyi @@ -226,6 +226,8 @@ class _Dataset: def replace_field_metadata(self, field_name: str, metadata: Dict[str, str]): ... 
@property def data_storage_version(self) -> str: ... + @property + def has_stable_row_ids(self) -> bool: ... def index_statistics(self, index_name: str) -> str: ... def serialized_manifest(self) -> bytes: ... def describe_indices(self) -> List[IndexDescription]: ... diff --git a/python/python/tests/test_dataset.py b/python/python/tests/test_dataset.py index 89bd78b82c8..39dac98aec6 100644 --- a/python/python/tests/test_dataset.py +++ b/python/python/tests/test_dataset.py @@ -424,16 +424,27 @@ def test_enable_stable_row_ids(tmp_path: Path): assert table_after["_rowaddr"][3].as_py() == (2 << 32) + 3 -def test_has_stable_row_ids_property(tmp_path: Path): - table = pa.Table.from_pylist([{"a": 1}, {"a": 2}]) +@pytest.mark.parametrize("enable_stable_row_ids", [True, False]) +@pytest.mark.parametrize( + "rows", + [[{"a": 1}, {"a": 2}], []], + ids=["non_empty", "empty"], +) +def test_has_stable_row_ids_property(tmp_path: Path, enable_stable_row_ids: bool, rows): + schema = pa.schema([pa.field("a", pa.int64())]) + table = pa.Table.from_pylist(rows, schema=schema) - stable_path = tmp_path / "stable" - lance.write_dataset(table, stable_path, enable_stable_row_ids=True) - assert lance.dataset(stable_path).has_stable_row_ids is True + path = tmp_path / f"stable_row_ids_{enable_stable_row_ids}_{len(rows)}" + lance.write_dataset( + table, + path, + enable_stable_row_ids=enable_stable_row_ids, + ) + ds = lance.dataset(path) - non_stable_path = tmp_path / "non_stable" - lance.write_dataset(table, non_stable_path, enable_stable_row_ids=False) - assert lance.dataset(non_stable_path).has_stable_row_ids is False + assert ds.count_rows() == len(rows) + assert len(ds.get_fragments()) == (0 if len(rows) == 0 else 1) + assert ds.has_stable_row_ids is enable_stable_row_ids def _list_manifests(versions_dir): From 9714d8ba1c6377ecf68103a6802d7ce8a36c0978 Mon Sep 17 00:00:00 2001 From: Lance Release Bot Date: Fri, 12 Jun 2026 15:35:58 +0000 Subject: [PATCH 095/177] chore: release beta version 
8.0.0-beta.13 --- .bumpversion.toml | 2 +- Cargo.lock | 74 +++++++++++++++++++-------------------- Cargo.toml | 44 +++++++++++------------ java/lance-jni/Cargo.lock | 66 +++++++++++++++++----------------- java/lance-jni/Cargo.toml | 2 +- java/pom.xml | 2 +- python/Cargo.lock | 66 +++++++++++++++++----------------- python/Cargo.toml | 2 +- 8 files changed, 126 insertions(+), 132 deletions(-) diff --git a/.bumpversion.toml b/.bumpversion.toml index b3ca85f628e..32cca52aa9a 100644 --- a/.bumpversion.toml +++ b/.bumpversion.toml @@ -1,5 +1,5 @@ [tool.bumpversion] -current_version = "8.0.0-beta.12" +current_version = "8.0.0-beta.13" parse = "(?P\\d+)\\.(?P\\d+)\\.(?P\\d+)(-(?P(beta|rc))\\.(?P\\d+))?" serialize = [ "{major}.{minor}.{patch}-{prerelease}.{prerelease_num}", diff --git a/Cargo.lock b/Cargo.lock index 64449a632ec..32baccd1ce0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1315,9 +1315,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.63" +version = "1.2.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "556e016178bb5662a08681bbe0f00f8e17631781a4dfc8c45e466e4b185ec27f" +checksum = "dad887fd958be91b5098c0248def011f4523ab786cd411be668777e55063501f" dependencies = [ "find-msvc-tools", "jobserver", @@ -2719,7 +2719,6 @@ version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" dependencies = [ - "powerfmt", "serde_core", ] @@ -3146,7 +3145,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4458,7 +4457,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "all_asserts", "approx", @@ -4561,7 +4560,7 @@ dependencies = [ [[package]] name = 
"lance-arrow" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow-array", "arrow-buffer", @@ -4609,7 +4608,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrayref", "paste", @@ -4618,7 +4617,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow-array", "arrow-buffer", @@ -4658,7 +4657,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow", "arrow-array", @@ -4691,7 +4690,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow", "arrow-array", @@ -4711,7 +4710,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "proc-macro2", "quote", @@ -4720,7 +4719,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow-arith", "arrow-array", @@ -4765,7 +4764,7 @@ dependencies = [ [[package]] name = "lance-examples" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "all_asserts", "arrow", @@ -4791,7 +4790,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow-arith", "arrow-array", @@ -4830,7 +4829,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "datafusion", "geo-traits", @@ -4844,7 +4843,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "approx", "arc-swap", @@ -4920,7 +4919,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow", "arrow-arith", @@ -4968,7 
+4967,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "approx", "arrow-array", @@ -4987,7 +4986,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow", "async-trait", @@ -4999,7 +4998,7 @@ dependencies = [ [[package]] name = "lance-namespace-datafusion" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow-array", "arrow-schema", @@ -5015,7 +5014,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow", "arrow-array", @@ -5078,7 +5077,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow-array", "arrow-buffer", @@ -5096,7 +5095,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow", "arrow-array", @@ -5142,7 +5141,7 @@ dependencies = [ [[package]] name = "lance-test-macros" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "proc-macro2", "quote", @@ -5151,7 +5150,7 @@ dependencies = [ [[package]] name = "lance-testing" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow-array", "arrow-schema", @@ -5164,7 +5163,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "icu_segmenter", "jieba-rs", @@ -5176,7 +5175,7 @@ dependencies = [ [[package]] name = "lance-tools" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "clap", "lance-core", @@ -5524,9 +5523,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.8.1" +version = "2.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8" +checksum = 
"88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4" [[package]] name = "memmap2" @@ -8353,9 +8352,9 @@ checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" [[package]] name = "smallvec" -version = "1.15.1" +version = "1.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" +checksum = "8ed6a63f02c8539c91a8685a86f4099661ba3da017932f6ebbea6de3f0fa7c90" [[package]] name = "snafu" @@ -8841,12 +8840,11 @@ dependencies = [ [[package]] name = "time" -version = "0.3.47" +version = "0.3.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" +checksum = "fc1aa89044e7786ffb2ec017acb22cb7de5b0be46d0f21aea2b224b8561e5db2" dependencies = [ "deranged", - "itoa", "num-conv", "powerfmt", "serde_core", @@ -8856,15 +8854,15 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.8" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" +checksum = "9e1c906769ad99c88eaa54e728060edef082f8e358ff32030cb7c7d315e81109" [[package]] name = "time-macros" -version = "0.2.27" +version = "0.2.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215" +checksum = "9d3bfe86347f0cc659f586f01e26303ccd32418f26f30c7b0309b3ca3a07d695" dependencies = [ "num-conv", "time-core", diff --git a/Cargo.toml b/Cargo.toml index 595d1fe41d6..c9740be6642 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ resolver = "3" [workspace.package] -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" @@ -57,27 +57,27 @@ rust-version = "1.91.0" [workspace.dependencies] arc-swap = "1.7" libc = "0.2.176" -lance = { version = 
"=8.0.0-beta.12", path = "./rust/lance", default-features = false } -lance-arrow = { version = "=8.0.0-beta.12", path = "./rust/lance-arrow" } -lance-core = { version = "=8.0.0-beta.12", path = "./rust/lance-core" } -lance-datafusion = { version = "=8.0.0-beta.12", path = "./rust/lance-datafusion" } -lance-datagen = { version = "=8.0.0-beta.12", path = "./rust/lance-datagen" } -lance-derive = { version = "=8.0.0-beta.12", path = "./rust/lance-derive" } -lance-encoding = { version = "=8.0.0-beta.12", path = "./rust/lance-encoding" } -lance-file = { version = "=8.0.0-beta.12", path = "./rust/lance-file" } -lance-geo = { version = "=8.0.0-beta.12", path = "./rust/lance-geo" } -lance-index = { version = "=8.0.0-beta.12", path = "./rust/lance-index" } -lance-io = { version = "=8.0.0-beta.12", path = "./rust/lance-io", default-features = false } -lance-linalg = { version = "=8.0.0-beta.12", path = "./rust/lance-linalg" } -lance-namespace = { version = "=8.0.0-beta.12", path = "./rust/lance-namespace" } -lance-namespace-impls = { version = "=8.0.0-beta.12", path = "./rust/lance-namespace-impls" } +lance = { version = "=8.0.0-beta.13", path = "./rust/lance", default-features = false } +lance-arrow = { version = "=8.0.0-beta.13", path = "./rust/lance-arrow" } +lance-core = { version = "=8.0.0-beta.13", path = "./rust/lance-core" } +lance-datafusion = { version = "=8.0.0-beta.13", path = "./rust/lance-datafusion" } +lance-datagen = { version = "=8.0.0-beta.13", path = "./rust/lance-datagen" } +lance-derive = { version = "=8.0.0-beta.13", path = "./rust/lance-derive" } +lance-encoding = { version = "=8.0.0-beta.13", path = "./rust/lance-encoding" } +lance-file = { version = "=8.0.0-beta.13", path = "./rust/lance-file" } +lance-geo = { version = "=8.0.0-beta.13", path = "./rust/lance-geo" } +lance-index = { version = "=8.0.0-beta.13", path = "./rust/lance-index" } +lance-io = { version = "=8.0.0-beta.13", path = "./rust/lance-io", default-features = false } +lance-linalg = { 
version = "=8.0.0-beta.13", path = "./rust/lance-linalg" } +lance-namespace = { version = "=8.0.0-beta.13", path = "./rust/lance-namespace" } +lance-namespace-impls = { version = "=8.0.0-beta.13", path = "./rust/lance-namespace-impls" } lance-namespace-datafusion = { version = "=7.0.0-beta.9", path = "./rust/lance-namespace-datafusion" } lance-namespace-reqwest-client = "0.8.4" -lance-select = { version = "=8.0.0-beta.12", path = "./rust/lance-select" } -lance-tokenizer = { version = "=8.0.0-beta.12", path = "./rust/lance-tokenizer" } -lance-table = { version = "=8.0.0-beta.12", path = "./rust/lance-table" } -lance-test-macros = { version = "=8.0.0-beta.12", path = "./rust/lance-test-macros" } -lance-testing = { version = "=8.0.0-beta.12", path = "./rust/lance-testing" } +lance-select = { version = "=8.0.0-beta.13", path = "./rust/lance-select" } +lance-tokenizer = { version = "=8.0.0-beta.13", path = "./rust/lance-tokenizer" } +lance-table = { version = "=8.0.0-beta.13", path = "./rust/lance-table" } +lance-test-macros = { version = "=8.0.0-beta.13", path = "./rust/lance-test-macros" } +lance-testing = { version = "=8.0.0-beta.13", path = "./rust/lance-testing" } approx = "0.5.1" # Note that this one does not include pyarrow arrow = { version = "58.0.0", optional = false, features = ["prettyprint"] } @@ -104,7 +104,7 @@ half = { "version" = "2.1", default-features = false, features = [ "num-traits", "std", ] } -lance-bitpacking = { version = "=8.0.0-beta.12", path = "./rust/compression/bitpacking" } +lance-bitpacking = { version = "=8.0.0-beta.13", path = "./rust/compression/bitpacking" } bitpacking = "0.9" bitvec = "1" bytes = "1.11.1" @@ -143,7 +143,7 @@ datafusion-substrait = "53.0.0" dirs = "6.0.0" either = "1.0" fst = { version = "0.4.7", features = ["levenshtein"] } -fsst = { version = "=8.0.0-beta.12", path = "./rust/compression/fsst" } +fsst = { version = "=8.0.0-beta.13", path = "./rust/compression/fsst" } futures = "0.3" geoarrow-array = "0.8" 
geoarrow-schema = "0.8" diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index 3cd9fc35067..df24deeca17 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -1090,9 +1090,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.63" +version = "1.2.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "556e016178bb5662a08681bbe0f00f8e17631781a4dfc8c45e466e4b185ec27f" +checksum = "dad887fd958be91b5098c0248def011f4523ab786cd411be668777e55063501f" dependencies = [ "find-msvc-tools", "jobserver", @@ -2276,7 +2276,6 @@ version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" dependencies = [ - "powerfmt", "serde_core", ] @@ -2549,7 +2548,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow-array", "rand 0.9.4", @@ -3749,7 +3748,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arc-swap", "arrow", @@ -3822,7 +3821,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow-array", "arrow-buffer", @@ -3864,7 +3863,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrayref", "paste", @@ -3873,7 +3872,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow-array", "arrow-buffer", @@ -3911,7 +3910,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow", "arrow-array", @@ -3943,7 +3942,7 @@ dependencies = [ [[package]] name = 
"lance-datagen" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow", "arrow-array", @@ -3961,7 +3960,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "proc-macro2", "quote", @@ -3970,7 +3969,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow-arith", "arrow-array", @@ -4005,7 +4004,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow-arith", "arrow-array", @@ -4035,7 +4034,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "datafusion", "geo-traits", @@ -4049,7 +4048,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arc-swap", "arrow", @@ -4116,7 +4115,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow", "arrow-arith", @@ -4157,7 +4156,7 @@ dependencies = [ [[package]] name = "lance-jni" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow", "arrow-array", @@ -4193,7 +4192,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow-array", "arrow-buffer", @@ -4208,7 +4207,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow", "async-trait", @@ -4220,7 +4219,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow", "arrow-ipc", @@ -4268,7 +4267,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow-array", "arrow-buffer", @@ -4283,7 
+4282,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow", "arrow-array", @@ -4320,7 +4319,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "icu_segmenter", "rust-stemmers", @@ -4575,9 +4574,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.8.1" +version = "2.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8" +checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4" [[package]] name = "mime" @@ -6796,9 +6795,9 @@ checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" [[package]] name = "smallvec" -version = "1.15.1" +version = "1.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" +checksum = "8ed6a63f02c8539c91a8685a86f4099661ba3da017932f6ebbea6de3f0fa7c90" [[package]] name = "snafu" @@ -7147,12 +7146,11 @@ dependencies = [ [[package]] name = "time" -version = "0.3.47" +version = "0.3.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" +checksum = "fc1aa89044e7786ffb2ec017acb22cb7de5b0be46d0f21aea2b224b8561e5db2" dependencies = [ "deranged", - "itoa", "num-conv", "powerfmt", "serde_core", @@ -7162,15 +7160,15 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.8" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" +checksum = "9e1c906769ad99c88eaa54e728060edef082f8e358ff32030cb7c7d315e81109" [[package]] name = "time-macros" -version = "0.2.27" +version = "0.2.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215" +checksum = "9d3bfe86347f0cc659f586f01e26303ccd32418f26f30c7b0309b3ca3a07d695" dependencies = [ "num-conv", "time-core", diff --git a/java/lance-jni/Cargo.toml b/java/lance-jni/Cargo.toml index 035a0e0ce8c..eee207d912e 100644 --- a/java/lance-jni/Cargo.toml +++ b/java/lance-jni/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lance-jni" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" edition = "2024" authors = ["Lance Devs "] rust-version = "1.91" diff --git a/java/pom.xml b/java/pom.xml index 3a377c35150..9a88cf039b5 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -7,7 +7,7 @@ org.lance lance-core Lance Core - 8.0.0-beta.12 + 8.0.0-beta.13 jar Lance Format Java API diff --git a/python/Cargo.lock b/python/Cargo.lock index 0be9c074402..eb9a1abe6bb 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -1252,9 +1252,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.63" +version = "1.2.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "556e016178bb5662a08681bbe0f00f8e17631781a4dfc8c45e466e4b185ec27f" +checksum = "dad887fd958be91b5098c0248def011f4523ab786cd411be668777e55063501f" dependencies = [ "find-msvc-tools", "jobserver", @@ -2586,7 +2586,6 @@ version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" dependencies = [ - "powerfmt", "serde_core", ] @@ -2899,7 +2898,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4115,7 +4114,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arc-swap", "arrow", @@ -4189,7 +4188,7 @@ dependencies = [ [[package]] name = 
"lance-arrow" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow-array", "arrow-buffer", @@ -4231,7 +4230,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrayref", "paste", @@ -4240,7 +4239,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow-array", "arrow-buffer", @@ -4278,7 +4277,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow", "arrow-array", @@ -4310,7 +4309,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow", "arrow-array", @@ -4328,7 +4327,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "proc-macro2", "quote", @@ -4337,7 +4336,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow-arith", "arrow-array", @@ -4372,7 +4371,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow-arith", "arrow-array", @@ -4402,7 +4401,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "datafusion", "geo-traits", @@ -4416,7 +4415,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arc-swap", "arrow", @@ -4484,7 +4483,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow", "arrow-arith", @@ -4525,7 +4524,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow-array", "arrow-buffer", @@ -4540,7 
+4539,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow", "async-trait", @@ -4552,7 +4551,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow", "arrow-ipc", @@ -4600,7 +4599,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow-array", "arrow-buffer", @@ -4615,7 +4614,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow", "arrow-array", @@ -4654,7 +4653,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "icu_segmenter", "jieba-rs", @@ -5001,9 +5000,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.8.1" +version = "2.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8" +checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4" [[package]] name = "memmap2" @@ -6142,7 +6141,7 @@ dependencies = [ [[package]] name = "pylance" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" dependencies = [ "arrow", "arrow-array", @@ -7537,9 +7536,9 @@ checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" [[package]] name = "smallvec" -version = "1.15.1" +version = "1.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" +checksum = "8ed6a63f02c8539c91a8685a86f4099661ba3da017932f6ebbea6de3f0fa7c90" [[package]] name = "snafu" @@ -7940,12 +7939,11 @@ dependencies = [ [[package]] name = "time" -version = "0.3.47" +version = "0.3.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" +checksum = "fc1aa89044e7786ffb2ec017acb22cb7de5b0be46d0f21aea2b224b8561e5db2" dependencies = [ "deranged", - "itoa", "num-conv", "powerfmt", "serde_core", @@ -7955,15 +7953,15 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.8" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" +checksum = "9e1c906769ad99c88eaa54e728060edef082f8e358ff32030cb7c7d315e81109" [[package]] name = "time-macros" -version = "0.2.27" +version = "0.2.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215" +checksum = "9d3bfe86347f0cc659f586f01e26303ccd32418f26f30c7b0309b3ca3a07d695" dependencies = [ "num-conv", "time-core", diff --git a/python/Cargo.toml b/python/Cargo.toml index e066e7d1077..5530f6ff60a 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pylance" -version = "8.0.0-beta.12" +version = "8.0.0-beta.13" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" From b9dc32dd5cf0beb06d74af468c165c80c0cc8b09 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Fri, 12 Jun 2026 09:23:04 -0700 Subject: [PATCH 096/177] perf(dir-catalog): avoid unchanged manifest reloads (#7234) Summary: - add dataset stale/successor helpers and commit-handler version probes - use the successor probe to skip unchanged directory manifest reloads - cover V1/V2 successor checks, external-manifest fallback, and cross-namespace manifest refresh --- rust/lance-namespace-impls/src/dir.rs | 34 +++++++++ .../lance-namespace-impls/src/dir/manifest.rs | 32 ++++++--- rust/lance-table/src/io/commit.rs | 20 ++++++ .../src/io/commit/external_manifest.rs | 25 +++++++ rust/lance/src/dataset.rs | 33 +++++++++ .../src/dataset/tests/dataset_versioning.rs | 71 +++++++++++++++++++ 
rust/lance/src/io/commit/external_manifest.rs | 26 +++++++ 7 files changed, 232 insertions(+), 9 deletions(-) diff --git a/rust/lance-namespace-impls/src/dir.rs b/rust/lance-namespace-impls/src/dir.rs index 6adc233d8a7..2a6921530c3 100644 --- a/rust/lance-namespace-impls/src/dir.rs +++ b/rust/lance-namespace-impls/src/dir.rs @@ -11779,6 +11779,40 @@ mod tests { ); } + #[tokio::test] + async fn test_manifest_reload_observes_new_version_from_other_namespace() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let namespace_a = DirectoryNamespaceBuilder::new(temp_path) + .manifest_enabled(true) + .dir_listing_enabled(false) + .build() + .await + .unwrap(); + create_scalar_table(&namespace_a, "alpha").await; + + let namespace_b = DirectoryNamespaceBuilder::new(temp_path) + .manifest_enabled(true) + .dir_listing_enabled(false) + .build() + .await + .unwrap(); + create_scalar_table(&namespace_b, "beta").await; + + let response = namespace_a + .list_tables(ListTablesRequest { + id: Some(vec![]), + ..Default::default() + }) + .await + .unwrap(); + + let mut tables = response.tables; + tables.sort(); + assert_eq!(tables, vec!["alpha", "beta"]); + } + #[tokio::test] async fn test_migration_not_found_errors_include_table_id() { let temp_dir = TempStdDir::default(); diff --git a/rust/lance-namespace-impls/src/dir/manifest.rs b/rust/lance-namespace-impls/src/dir/manifest.rs index 067239b8765..11c9e1c193d 100644 --- a/rust/lance-namespace-impls/src/dir/manifest.rs +++ b/rust/lance-namespace-impls/src/dir/manifest.rs @@ -670,13 +670,23 @@ pub struct NamespaceInfo { /// A wrapper around a Dataset that provides concurrent access. /// /// This can be cloned cheaply. It supports concurrent reads or exclusive writes. -/// The manifest dataset is always kept strongly consistent by reloading on each read. 
+/// The manifest dataset uses contiguous attached versions and this module never +/// runs old-version cleanup on it, allowing reads to check only the immediate +/// successor manifest before deciding whether a reload is needed. #[derive(Debug, Clone)] pub struct DatasetConsistencyWrapper(Arc>); impl DatasetConsistencyWrapper { /// Create a new wrapper with the given dataset. pub fn new(dataset: Dataset) -> Self { + debug_assert!( + !dataset + .manifest() + .config + .keys() + .any(|key| key.starts_with("lance.auto_cleanup.")), + "the directory manifest dataset must not enable old-version cleanup" + ); Self(Arc::new(RwLock::new(dataset))) } @@ -728,21 +738,25 @@ impl DatasetConsistencyWrapper { dataset_uri, current_version ); - let latest_version = read_guard.latest_version_id().await.map_err(|e| { + // The directory manifest table uses contiguous attached versions and + // does not run old-version cleanup, so the immediate successor probe is + // enough to detect changes without resolving or loading the latest + // manifest on every namespace read. 
+ let has_successor_version = read_guard.has_successor_version().await.map_err(|e| { lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to get latest version: {:?}", e), + message: format!("Failed to check dataset staleness: {:?}", e), }) })?; log::debug!( - "Reload got latest_version={} for uri={}, current_version={}", - latest_version, + "Reload checked successor_version_exists={} for uri={}, current_version={}", + has_successor_version, dataset_uri, current_version ); drop(read_guard); // If already up-to-date, return early - if latest_version == current_version { + if !has_successor_version { log::debug!("Already up-to-date for uri={}", dataset_uri); return Ok(()); } @@ -751,13 +765,13 @@ impl DatasetConsistencyWrapper { let mut write_guard = self.0.write().await; // Double-check after acquiring write lock (someone else might have reloaded) - let latest_version = write_guard.latest_version_id().await.map_err(|e| { + let has_successor_version = write_guard.has_successor_version().await.map_err(|e| { lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to get latest version: {:?}", e), + message: format!("Failed to check dataset staleness: {:?}", e), }) })?; - if latest_version != write_guard.version().version { + if has_successor_version { write_guard.checkout_latest().await.map_err(|e| { lance_core::Error::from(NamespaceError::Internal { message: format!("Failed to checkout latest: {:?}", e), diff --git a/rust/lance-table/src/io/commit.rs b/rust/lance-table/src/io/commit.rs index 3784e84a785..e1a4086730b 100644 --- a/rust/lance-table/src/io/commit.rs +++ b/rust/lance-table/src/io/commit.rs @@ -798,6 +798,26 @@ pub trait CommitHandler: Debug + Send + Sync { default_resolve_version(base_path, version, object_store).await } + /// Check whether an attached manifest version exists without loading it. + /// + /// The default implementation probes the deterministic manifest path for + /// the given naming scheme. 
Commit handlers with an external source of + /// truth should override this method. + async fn version_exists( + &self, + base_path: &Path, + version: u64, + object_store: &dyn OSObjectStore, + naming_scheme: ManifestNamingScheme, + ) -> Result { + let path = naming_scheme.manifest_path(base_path, version); + match object_store.head(&path).await { + Ok(_) => Ok(true), + Err(ObjectStoreError::NotFound { .. }) => Ok(false), + Err(e) => Err(e.into()), + } + } + /// List detached manifest locations. /// /// Returns a stream of detached manifest locations in arbitrary order. diff --git a/rust/lance-table/src/io/commit/external_manifest.rs b/rust/lance-table/src/io/commit/external_manifest.rs index 75993ca8d1f..a6c9bbaa90d 100644 --- a/rust/lance-table/src/io/commit/external_manifest.rs +++ b/rust/lance-table/src/io/commit/external_manifest.rs @@ -456,6 +456,31 @@ impl CommitHandler for ExternalManifestCommitHandler { .await } + async fn version_exists( + &self, + base_path: &Path, + version: u64, + object_store: &dyn OSObjectStore, + naming_scheme: ManifestNamingScheme, + ) -> Result { + match self + .external_manifest_store + .get_manifest_location(base_path.as_ref(), version) + .await + { + Ok(_) => Ok(true), + Err(Error::NotFound { .. }) => { + let path = naming_scheme.manifest_path(base_path, version); + match object_store.head(&path).await { + Ok(_) => Ok(true), + Err(ObjectStoreError::NotFound { .. }) => Ok(false), + Err(e) => Err(e.into()), + } + } + Err(e) => Err(e), + } + } + async fn commit( &self, manifest: &mut Manifest, diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index 23d824fd6fd..3e0d77704da 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -2241,6 +2241,39 @@ impl Dataset { .version) } + /// Return whether the dataset has a newer committed version. 
+ pub async fn is_stale(&self) -> Result { + let latest_version = self.latest_version_id().await?; + Ok(latest_version != self.manifest.version) + } + + /// Return whether the immediate attached successor manifest exists. + /// + /// This is a fast contiguous-history probe. It does not resolve the latest + /// version and may return `false` if intermediate manifests have been + /// removed. Callers that need a general freshness check should use + /// [`Self::is_stale`]. + #[doc(hidden)] + pub async fn has_successor_version(&self) -> Result { + let Some(next_version) = self.manifest.version.checked_add(1) else { + return Ok(false); + }; + if lance_table::format::is_detached_version(next_version) { + return Ok(false); + } + + let exists = self + .commit_handler + .version_exists( + &self.base, + next_version, + self.object_store.inner.as_ref(), + self.manifest_location.naming_scheme, + ) + .await?; + Ok(exists) + } + pub fn count_fragments(&self) -> usize { self.manifest.fragments.len() } diff --git a/rust/lance/src/dataset/tests/dataset_versioning.rs b/rust/lance/src/dataset/tests/dataset_versioning.rs index a0bc7816a32..c04dd0f3183 100644 --- a/rust/lance/src/dataset/tests/dataset_versioning.rs +++ b/rust/lance/src/dataset/tests/dataset_versioning.rs @@ -211,6 +211,77 @@ async fn test_version_id_fast_path() { assert_eq!(historical.latest_version_id().await.unwrap(), 2); } +#[rstest] +#[tokio::test] +async fn test_stale_checks_cover_fast_successor_and_latest_version( + #[values(false, true)] enable_v2_manifest_paths: bool, +) { + let expected_scheme = if enable_v2_manifest_paths { + ManifestNamingScheme::V2 + } else { + ManifestNamingScheme::V1 + }; + let test_uri = TempStrDir::default(); + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "i", + DataType::UInt32, + false, + )])); + + let data = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(UInt32Array::from_iter_values(0..5))], + ) + .unwrap(); + let reader = 
RecordBatchIterator::new(vec![data].into_iter().map(Ok), schema.clone()); + + let original = Dataset::write( + reader, + &test_uri, + Some(WriteParams { + enable_v2_manifest_paths, + ..Default::default() + }), + ) + .await + .unwrap(); + assert_eq!(original.manifest_location().naming_scheme, expected_scheme); + assert!(!original.is_stale().await.unwrap()); + assert!(!original.has_successor_version().await.unwrap()); + + let data = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(UInt32Array::from_iter_values(5..10))], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![data].into_iter().map(Ok), schema); + let updated = Dataset::write( + reader, + &test_uri, + Some(WriteParams { + mode: WriteMode::Append, + enable_v2_manifest_paths, + ..Default::default() + }), + ) + .await + .unwrap(); + + assert!(original.is_stale().await.unwrap()); + assert!(original.has_successor_version().await.unwrap()); + assert_eq!(updated.manifest_location().naming_scheme, expected_scheme); + assert!(!updated.is_stale().await.unwrap()); + assert!(!updated.has_successor_version().await.unwrap()); + + let historical = updated.checkout_version(1).await.unwrap(); + assert_eq!( + historical.manifest_location().naming_scheme, + expected_scheme + ); + assert!(historical.is_stale().await.unwrap()); + assert!(historical.has_successor_version().await.unwrap()); +} + #[rstest] #[tokio::test] async fn test_restore( diff --git a/rust/lance/src/io/commit/external_manifest.rs b/rust/lance/src/io/commit/external_manifest.rs index df2b84a4878..eee4fbf07b6 100644 --- a/rust/lance/src/io/commit/external_manifest.rs +++ b/rust/lance/src/io/commit/external_manifest.rs @@ -365,6 +365,32 @@ mod test { assert_eq!(ds.version().version, 6); assert_eq!(ds.count_rows(None).await.unwrap(), 60); + { + inner_store.lock().await.remove(&(ds.base.to_string(), 6)); + } + assert!( + handler + .version_exists( + &ds.base, + 6, + ds.object_store.inner.as_ref(), + ds.manifest_location().naming_scheme, + ) + 
.await + .unwrap() + ); + assert!( + !handler + .version_exists( + &ds.base, + 7, + ds.object_store.inner.as_ref(), + ds.manifest_location().naming_scheme, + ) + .await + .unwrap() + ); + // Open without external store handler again, should see the newly sync'd commit let ds = DatasetBuilder::from_uri(ds_uri).load().await.unwrap(); assert_eq!(ds.version().version, 6); From e100ce5e047ac854a6a04a83f8b91ae182c9958b Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Fri, 12 Jun 2026 16:35:14 -0700 Subject: [PATCH 097/177] refactor(namespace): remove table_version_storage_enabled and __manifest version storage (#7222) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `__manifest` cannot sustain per-commit version-write TPS; it suits table create/remove, not every commit. This removes the `table_version_storage_enabled` option and the entire `__manifest`-backed table-version path. Version list/create/describe/batch-delete now use the physical `_versions/` directory exclusively, which was always the source of truth, so no data is lost. Kept: `table_version_tracking_enabled` (managed versioning) and `manifest_enabled` (table tracking) — both unaffected and served from physical storage. The multi-table transaction work that motivated version storage will be re-proposed on a separate mechanism. 
--- Cargo.lock | 15 +- java/lance-jni/Cargo.lock | 15 +- python/Cargo.lock | 15 +- rust/lance-namespace-impls/Cargo.toml | 5 + rust/lance-namespace-impls/src/dir.rs | 538 +--------------- .../lance-namespace-impls/src/dir/manifest.rs | 582 +----------------- .../lance-namespace-impls/src/rest_adapter.rs | 7 +- 7 files changed, 56 insertions(+), 1121 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 32baccd1ce0..2050bd0698e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2719,6 +2719,7 @@ version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" dependencies = [ + "powerfmt", "serde_core", ] @@ -5053,6 +5054,7 @@ dependencies = [ "serde_json", "sha2 0.10.9", "tempfile", + "time", "tokio", "tower", "tower-http 0.5.2", @@ -8840,11 +8842,12 @@ dependencies = [ [[package]] name = "time" -version = "0.3.48" +version = "0.3.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc1aa89044e7786ffb2ec017acb22cb7de5b0be46d0f21aea2b224b8561e5db2" +checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" dependencies = [ "deranged", + "itoa", "num-conv", "powerfmt", "serde_core", @@ -8854,15 +8857,15 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.9" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e1c906769ad99c88eaa54e728060edef082f8e358ff32030cb7c7d315e81109" +checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" [[package]] name = "time-macros" -version = "0.2.28" +version = "0.2.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d3bfe86347f0cc659f586f01e26303ccd32418f26f30c7b0309b3ca3a07d695" +checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215" dependencies = [ "num-conv", "time-core", diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index 
df24deeca17..5733f730fc0 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -2276,6 +2276,7 @@ version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" dependencies = [ + "powerfmt", "serde_core", ] @@ -4244,6 +4245,7 @@ dependencies = [ "roaring", "serde", "serde_json", + "time", "tokio", "tower", "tower-http 0.5.2", @@ -7146,11 +7148,12 @@ dependencies = [ [[package]] name = "time" -version = "0.3.48" +version = "0.3.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc1aa89044e7786ffb2ec017acb22cb7de5b0be46d0f21aea2b224b8561e5db2" +checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" dependencies = [ "deranged", + "itoa", "num-conv", "powerfmt", "serde_core", @@ -7160,15 +7163,15 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.9" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e1c906769ad99c88eaa54e728060edef082f8e358ff32030cb7c7d315e81109" +checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" [[package]] name = "time-macros" -version = "0.2.28" +version = "0.2.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d3bfe86347f0cc659f586f01e26303ccd32418f26f30c7b0309b3ca3a07d695" +checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215" dependencies = [ "num-conv", "time-core", diff --git a/python/Cargo.lock b/python/Cargo.lock index eb9a1abe6bb..f2f9990e7cd 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -2586,6 +2586,7 @@ version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" dependencies = [ + "powerfmt", "serde_core", ] @@ -4576,6 +4577,7 @@ dependencies = [ "roaring", "serde", "serde_json", + "time", "tokio", "tower", 
"tower-http 0.5.2", @@ -7939,11 +7941,12 @@ dependencies = [ [[package]] name = "time" -version = "0.3.48" +version = "0.3.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc1aa89044e7786ffb2ec017acb22cb7de5b0be46d0f21aea2b224b8561e5db2" +checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" dependencies = [ "deranged", + "itoa", "num-conv", "powerfmt", "serde_core", @@ -7953,15 +7956,15 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.9" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e1c906769ad99c88eaa54e728060edef082f8e358ff32030cb7c7d315e81109" +checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" [[package]] name = "time-macros" -version = "0.2.28" +version = "0.2.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d3bfe86347f0cc659f586f01e26303ccd32418f26f30c7b0309b3ca3a07d695" +checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215" dependencies = [ "num-conv", "time-core", diff --git a/rust/lance-namespace-impls/Cargo.toml b/rust/lance-namespace-impls/Cargo.toml index c2bf057ee21..27b9a4bc0e2 100644 --- a/rust/lance-namespace-impls/Cargo.toml +++ b/rust/lance-namespace-impls/Cargo.toml @@ -79,6 +79,11 @@ base64 = { version = "0.22", optional = true } aws-sdk-sts = { version = "1.38.0", optional = true, default-features = false, features = ["default-https-client", "rt-tokio"] } aws-config = { workspace = true, optional = true } +# Pin: time 0.3.48 conflicts with aws-smithy-types (E0119: conflicting `From` impls), which this +# crate pulls in via the AWS credential vendor. Capping time here forces the workspace resolver to +# 0.3.47 even for no-lock builds. Not used directly; remove once the upstream conflict is resolved. 
+time = "=0.3.47" + # GCP credential vending dependencies (optional, enabled by "credential-vendor-gcp" feature) ring = { version = "0.17", optional = true } rustls-pki-types = { version = "1", optional = true } diff --git a/rust/lance-namespace-impls/src/dir.rs b/rust/lance-namespace-impls/src/dir.rs index 2a6921530c3..b8b8a126b7c 100644 --- a/rust/lance-namespace-impls/src/dir.rs +++ b/rust/lance-namespace-impls/src/dir.rs @@ -195,9 +195,6 @@ pub struct DirectoryNamespaceBuilder { dir_listing_enabled: bool, inline_optimization_enabled: bool, table_version_tracking_enabled: bool, - /// When true, table versions are stored in the `__manifest` table instead of - /// relying on Lance's native version management. - table_version_storage_enabled: bool, /// When true, enables migration mode where the namespace checks the manifest first /// before falling back to directory listing for root-level tables. When false (default), /// root-level tables use directory listing directly without checking the manifest, @@ -233,10 +230,6 @@ impl std::fmt::Debug for DirectoryNamespaceBuilder { "table_version_tracking_enabled", &self.table_version_tracking_enabled, ) - .field( - "table_version_storage_enabled", - &self.table_version_storage_enabled, - ) .field( "dir_listing_to_manifest_migration_enabled", &self.dir_listing_to_manifest_migration_enabled, @@ -273,7 +266,6 @@ impl DirectoryNamespaceBuilder { dir_listing_enabled: true, // Default to enabled for backwards compatibility inline_optimization_enabled: true, table_version_tracking_enabled: false, // Default to disabled - table_version_storage_enabled: false, // Default to disabled dir_listing_to_manifest_migration_enabled: false, // Default to disabled credential_vendor_properties: HashMap::new(), context_provider: None, @@ -334,19 +326,6 @@ impl DirectoryNamespaceBuilder { self } - /// Enable or disable table version management through the `__manifest` table. 
- /// - /// When enabled, table versions are tracked as `table_version` entries in the - /// `__manifest` Lance table. This enables: - /// - Centralized version tracking instead of per-table `_versions/` directories - /// - /// Requires `manifest_enabled` to be true. - /// When disabled (default), version storage uses per-table storage operations. - pub fn table_version_storage_enabled(mut self, enabled: bool) -> Self { - self.table_version_storage_enabled = enabled; - self - } - /// Create a DirectoryNamespaceBuilder from properties HashMap. /// /// This method parses a properties map into builder configuration. @@ -464,12 +443,6 @@ impl DirectoryNamespaceBuilder { .and_then(|v| v.parse::().ok()) .unwrap_or(false); - // Extract table_version_storage_enabled (default: false) - let table_version_storage_enabled = properties - .get("table_version_storage_enabled") - .and_then(|v| v.parse::().ok()) - .unwrap_or(false); - // Extract dir_listing_to_manifest_migration_enabled (default: false) let dir_listing_to_manifest_migration_enabled = properties .get("dir_listing_to_manifest_migration_enabled") @@ -516,7 +489,6 @@ impl DirectoryNamespaceBuilder { dir_listing_enabled, inline_optimization_enabled, table_version_tracking_enabled, - table_version_storage_enabled, dir_listing_to_manifest_migration_enabled, credential_vendor_properties, context_provider: None, @@ -693,14 +665,6 @@ impl DirectoryNamespaceBuilder { /// - Connection to the storage backend fails /// - Storage options are invalid pub async fn build(self) -> Result { - // Validate: table_version_storage_enabled requires manifest_enabled - if self.table_version_storage_enabled && !self.manifest_enabled { - return Err(NamespaceError::InvalidInput { - message: "table_version_storage_enabled requires manifest_enabled=true".to_string(), - } - .into()); - } - let (object_store, base_path) = Self::initialize_object_store(&self.root, &self.storage_options, &self.session).await?; @@ -714,7 +678,6 @@ impl 
DirectoryNamespaceBuilder { self.dir_listing_enabled, self.inline_optimization_enabled, self.commit_retries, - self.table_version_storage_enabled, ) .await { @@ -759,7 +722,6 @@ impl DirectoryNamespaceBuilder { dir_listing_to_manifest_migration_enabled: self .dir_listing_to_manifest_migration_enabled, table_version_tracking_enabled: self.table_version_tracking_enabled, - table_version_storage_enabled: self.table_version_storage_enabled, credential_vendor, context_provider: self.context_provider, vend_input_storage_options: self.vend_input_storage_options, @@ -842,8 +804,6 @@ pub struct DirectoryNamespace { /// When true, `describe_table` returns `managed_versioning: true` to indicate /// commits should go through namespace table version APIs. table_version_tracking_enabled: bool, - /// When true, table versions are stored in the `__manifest` table. - table_version_storage_enabled: bool, /// Credential vendor created once during initialization. /// Used to vend temporary credentials for table access. credential_vendor: Option>, @@ -2211,18 +2171,16 @@ impl DirectoryNamespace { Ok(migrated_count) } - /// Delete physical manifest files for the given table version ranges (best-effort). + /// Delete physical manifest files for the given table version ranges. /// - /// This helper is used by `batch_delete_table_versions` in both the manifest-enabled - /// and non-manifest paths. It resolves each table's storage location, computes the - /// version file paths, and attempts to delete them. Errors are logged (best-effort) - /// when `best_effort` is true, or returned immediately when false. + /// This helper backs `batch_delete_table_versions`. It resolves each table's storage + /// location, computes the version file paths, and deletes them, returning an error on + /// the first failure. /// /// Returns the number of files successfully deleted. 
async fn delete_physical_version_files( &self, table_entries: &[TableDeleteEntry], - best_effort: bool, branch: Option<&str>, ) -> Result { let mut deleted_count = 0i64; @@ -2268,22 +2226,13 @@ impl DirectoryNamespace { } Err(object_store::Error::NotFound { .. }) => {} Err(e) => { - if best_effort { - log::warn!( - "Failed to delete manifest file for version {} of table {:?}: {:?}", - v, - te.table_id, - e - ); - } else { - return Err(NamespaceError::Internal { - message: format!( - "Failed to delete version {} for table at '{}': {}", - v, table_uri, e - ), - } - .into()); + return Err(NamespaceError::Internal { + message: format!( + "Failed to delete version {} for table at '{}': {}", + v, table_uri, e + ), } + .into()); } } } @@ -2927,20 +2876,6 @@ impl LanceNamespace for DirectoryNamespace { ) -> Result { self.record_op("list_table_versions"); let branch = Self::normalized_branch(request.branch.as_deref())?; - // The manifest catalog has no branch concept, so a branch lists its own - // version chain from storage under its tree path instead. - if branch.is_none() - && self.table_version_storage_enabled - && let Some(ref manifest_ns) = self.manifest_ns - { - let table_id = request.id.clone().unwrap_or_default(); - let want_descending = request.descending == Some(true); - return manifest_ns - .list_table_versions(&table_id, want_descending, request.limit) - .await; - } - - // Fallback when table_version_storage is not enabled: list from _versions/ directory let table_uri = self.resolve_table_location(&request.id).await?; let table_uri = match branch { Some(b) => self.resolve_branch_location(&table_uri, b).await?, @@ -3087,43 +3022,6 @@ impl LanceNamespace for DirectoryNamespace { ); } - // Also record in __manifest (best-effort). Branches aren't tracked there, - // so for a branch the storage manifest above is the only record. 
- if branch.is_none() - && self.table_version_storage_enabled - && let Some(ref manifest_ns) = self.manifest_ns - { - let table_id_str = - manifest::ManifestNamespace::str_object_id(&request.id.clone().unwrap_or_default()); - let object_id = - manifest::ManifestNamespace::build_version_object_id(&table_id_str, version as i64); - let metadata_json = serde_json::json!({ - "manifest_path": final_path.to_string(), - "manifest_size": manifest_size, - "e_tag": final_meta.e_tag, - "naming_scheme": request.naming_scheme.as_deref().unwrap_or("V2"), - }) - .to_string(); - - if let Err(e) = manifest_ns - .insert_into_manifest_with_metadata( - vec![manifest::ManifestEntry { - object_id, - object_type: manifest::ObjectType::TableVersion, - location: None, - metadata: Some(metadata_json), - }], - None, - ) - .await - { - log::warn!( - "Failed to record table version in __manifest (best-effort): {:?}", - e - ); - } - } - Ok(CreateTableVersionResponse { transaction_id: None, version: Some(Box::new(TableVersion { @@ -3143,18 +3041,6 @@ impl LanceNamespace for DirectoryNamespace { ) -> Result { self.record_op("describe_table_version"); let branch = Self::normalized_branch(request.branch.as_deref())?; - // When table_version_storage_enabled and a specific version is requested, - // query from __manifest to avoid opening the entire dataset. A branch has - // no manifest-catalog entry, so it resolves from storage instead. - if branch.is_none() - && self.table_version_storage_enabled - && let (Some(manifest_ns), Some(version)) = (&self.manifest_ns, request.version) - { - let table_id = request.id.clone().unwrap_or_default(); - return manifest_ns.describe_table_version(&table_id, version).await; - } - - // Fallback when table_version_storage is not enabled: inspect physical manifests directly. 
let table_uri = self.resolve_table_location(&request.id).await?; let table_uri = match branch { Some(b) => self.resolve_branch_location(&table_uri, b).await?, @@ -3206,9 +3092,9 @@ impl LanceNamespace for DirectoryNamespace { .map(|r| (r.start_version, r.end_version)) .collect(); - // Reject pathological bounded ranges up front: the manifest path below - // builds one id per version, so (0, i64::MAX) would exhaust memory. A - // through-latest range (end < 0) is bounded by the manifests that exist. + // Reject pathological bounded ranges up front: an explicit huge bounded + // range like (0, i64::MAX) is almost certainly a mistake. A through-latest + // range (end < 0) is bounded by the manifests that actually exist on storage. const MAX_VERSIONS_PER_REQUEST: i128 = 1_000_000; let requested: i128 = ranges .iter() @@ -3235,72 +3121,8 @@ impl LanceNamespace for DirectoryNamespace { ranges, }]; - // Branches are not tracked in the manifest catalog, so a branch skips the - // __manifest phase entirely and deletes its physical manifests directly. - if branch.is_none() - && self.table_version_storage_enabled - && let Some(ref manifest_ns) = self.manifest_ns - { - // Through-latest ranges (end_version < 0) would require enumerating the - // __manifest chain up to the latest version, which is not wired up here. - // Reject rather than silently delete physical files while leaving the - // __manifest records in place. - if table_entries - .iter() - .any(|te| te.ranges.iter().any(|&(_, e)| e < 0)) - { - return Err(NamespaceError::Unsupported { - message: "through-latest delete (end_version < 0) is not supported \ - for managed-versioning tables" - .to_string(), - } - .into()); - } - - // Phase 1 (atomic commit point): Delete version records from __manifest - // for ALL tables in a single atomic copy-on-write rewrite. This is the - // authoritative source of truth — once __manifest entries are removed, - // the versions are logically deleted across all tables atomically. 
- // - // Request `ranges` carry an exclusive end (`[start, end)`); the manifest - // rewrite API matches an inclusive `[start, end]`, so shift the end down - // by one. Empty ranges collapse to start > end and are dropped downstream. - let table_ranges = table_entries - .iter() - .map(|te| { - let object_id = manifest::ManifestNamespace::str_object_id( - &te.table_id.clone().unwrap_or_default(), - ); - let inclusive_ranges = te - .ranges - .iter() - .map(|&(start, end)| (start, end - 1)) - .collect::>(); - (object_id, inclusive_ranges) - }) - .collect::>(); - let total_deleted_count = manifest_ns - .batch_delete_table_versions_by_ranges(&table_ranges) - .await?; - - // Phase 2: Delete physical manifest files (best-effort). - // Even if some file deletions fail, the versions are already removed from - // __manifest, so they won't be visible to readers. Leftover files are - // orphaned but harmless and can be cleaned up later. - let _ = self - .delete_physical_version_files(&table_entries, true, branch) - .await; - - return Ok(BatchDeleteTableVersionsResponse { - deleted_count: Some(total_deleted_count), - transaction_id: None, - }); - } - - // Direct path: delete physical files (no __manifest). Reached when storage - // tracking is off, or for any branch (which has no __manifest entries). let total_deleted_count = self - .delete_physical_version_files(&table_entries, false, branch) + .delete_physical_version_files(&table_entries, branch) .await?; Ok(BatchDeleteTableVersionsResponse { @@ -5376,7 +5198,6 @@ mod tests { DirectoryNamespaceBuilder::new(temp.to_str().unwrap()) .manifest_enabled(true) .table_version_tracking_enabled(true) - .table_version_storage_enabled(true) .ops_metrics_enabled(true) .build() .await @@ -5751,150 +5572,12 @@ mod tests { ); } - /// The managed `__manifest` delete path (the authoritative catalog) must honor - /// the exclusive end: `[min, max)` removes exactly min..max from `__manifest`, - /// keeping max. 
With storage tracking on, the writes register versions in - /// `__manifest` and `list_table_versions` reads it back, so this exercises the - /// Phase-1 path that the physical-path tests never reach. - #[tokio::test] - async fn test_batch_delete_managed_manifest_exclusive() { - use arrow::array::Int32Array; - use arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; - use lance_namespace::models::{BatchDeleteTableVersionsRequest, VersionRange}; - - let temp = TempStdDir::default(); - let ns: Arc = Arc::new( - DirectoryNamespaceBuilder::new(temp.to_str().unwrap()) - .manifest_enabled(true) - .table_version_tracking_enabled(true) - .table_version_storage_enabled(true) - .build() - .await - .unwrap(), - ); - let table_id = vec!["users".to_string()]; - let schema = Arc::new(ArrowSchema::new(vec![Field::new( - "id", - DataType::Int32, - false, - )])); - let batch = |seed: i32| { - arrow::record_batch::RecordBatch::try_new( - schema.clone(), - vec![Arc::new(Int32Array::from(vec![seed]))], - ) - .unwrap() - }; - - // Register v1, v2, v3 in __manifest via the managed write flow. - let mut ds = Dataset::write_into_namespace( - RecordBatchIterator::new(vec![Ok(batch(1))], schema.clone()), - ns.clone(), - table_id.clone(), - Some(WriteParams { - mode: WriteMode::Create, - ..Default::default() - }), - ) - .await - .unwrap(); - ds.append( - RecordBatchIterator::new(vec![Ok(batch(2))], schema.clone()), - None, - ) - .await - .unwrap(); - ds.append( - RecordBatchIterator::new(vec![Ok(batch(3))], schema.clone()), - None, - ) - .await - .unwrap(); - - let before = ns - .list_table_versions(ListTableVersionsRequest { - id: Some(table_id.clone()), - ..Default::default() - }) - .await - .unwrap() - .versions; - assert!( - before.len() >= 3, - "expected v1..v3 tracked in __manifest: {:?}", - before - ); - let min_v = before.iter().map(|v| v.version).min().unwrap(); - let max_v = before.iter().map(|v| v.version).max().unwrap(); - - // [min, max): exclusive end keeps max. 
- ns.batch_delete_table_versions(BatchDeleteTableVersionsRequest { - id: Some(table_id.clone()), - ranges: vec![VersionRange::new(min_v, max_v)], - ..Default::default() - }) - .await - .unwrap(); - - let after = ns - .list_table_versions(ListTableVersionsRequest { - id: Some(table_id.clone()), - ..Default::default() - }) - .await - .unwrap() - .versions; - assert_eq!( - after.len(), - 1, - "only the exclusive end (max) should remain in __manifest: {:?}", - after - ); - assert_eq!(after[0].version, max_v, "max must be kept"); - } - - /// On the managed path, a through-latest delete (`end_version < 0`) is rejected - /// rather than silently deleting physical files while leaving `__manifest` - /// records in place. - #[tokio::test] - async fn test_batch_delete_managed_rejects_through_latest() { - use lance_namespace::models::{BatchDeleteTableVersionsRequest, VersionRange}; - - let temp = TempStdDir::default(); - let ns: Arc = Arc::new( - DirectoryNamespaceBuilder::new(temp.to_str().unwrap()) - .manifest_enabled(true) - .table_version_tracking_enabled(true) - .table_version_storage_enabled(true) - .build() - .await - .unwrap(), - ); - - let err = ns - .batch_delete_table_versions(BatchDeleteTableVersionsRequest { - id: Some(vec!["users".to_string()]), - ranges: vec![VersionRange::new(0, -1)], - ..Default::default() - }) - .await; - assert!( - err.is_err(), - "through-latest delete must be rejected on the managed path" - ); - assert!( - err.unwrap_err().to_string().contains("not supported"), - "expected a not-supported error" - ); - } - /// Build a managed (manifest-tracked) namespace over `path`. 
async fn create_managed_namespace(path: &str) -> Arc { Arc::new( DirectoryNamespaceBuilder::new(path) .manifest_enabled(true) .table_version_tracking_enabled(true) - .table_version_storage_enabled(true) .build() .await .unwrap(), @@ -6324,7 +6007,6 @@ mod tests { DirectoryNamespaceBuilder::new(temp.to_str().unwrap()) .manifest_enabled(true) .table_version_tracking_enabled(true) - .table_version_storage_enabled(true) .ops_metrics_enabled(true) .build() .await @@ -6470,49 +6152,6 @@ mod tests { ); } - /// With the manifest store enabled, branch ops must still bypass the catalog - /// fast-path and read the chain from `tree//_versions/`. Without the - /// `branch.is_none()` guard this would query `__manifest` (which has no - /// branch entries) and return the wrong result. The other branch tests use a - /// store-disabled namespace, so this pins the enabled path specifically. - #[tokio::test] - async fn test_branch_ops_skip_manifest_store_when_enabled() { - let temp_dir = TempStdDir::default(); - let namespace = DirectoryNamespaceBuilder::new(temp_dir.to_str().unwrap()) - .manifest_enabled(true) - .table_version_storage_enabled(true) - .build() - .await - .unwrap(); - - create_scalar_table(&namespace, "users").await; - create_branch_with_commits(&namespace, "users", "exp", 2).await; - - // list resolves the branch chain from storage despite storage tracking - // being on (a successful result with tree/exp paths proves the bypass: - // the catalog has no "exp" entry, so the fast-path would not return these). - let branch_versions = list_versions(&namespace, "users", Some("exp")) - .await - .unwrap(); - assert!(branch_versions.len() >= 2); - assert!( - branch_versions - .iter() - .all(|v| v.manifest_path.contains("tree/exp")), - "branch versions must come from branch storage with the store enabled: {:?}", - branch_versions - ); - - // describe likewise resolves from the branch's storage. 
- let req = DescribeTableVersionRequest { - id: Some(vec!["users".to_string()]), - branch: Some("exp".to_string()), - ..Default::default() - }; - let resp = namespace.describe_table_version(req).await.unwrap(); - assert!(resp.version.manifest_path.contains("tree/exp")); - } - #[tokio::test] async fn test_create_table() { let (namespace, _temp_dir) = create_test_namespace().await; @@ -11277,155 +10916,6 @@ mod tests { } } - /// Tests for multi-table transaction support via table_version_storage_enabled. - mod multi_table_transactions { - use super::*; - use futures::TryStreamExt; - use lance::dataset::builder::DatasetBuilder; - use lance_namespace::models::CreateTableVersionRequest; - - /// Helper to create a namespace with table_version_storage_enabled enabled - async fn create_managed_namespace(temp_path: &str) -> Arc { - Arc::new( - DirectoryNamespaceBuilder::new(temp_path) - .table_version_tracking_enabled(true) - .table_version_storage_enabled(true) - .manifest_enabled(true) - .build() - .await - .unwrap(), - ) - } - - /// Helper to create a table and get its staging manifest path - async fn create_table_and_get_staging( - namespace: Arc, - table_name: &str, - ) -> (Vec, object_store::path::Path) { - let schema = create_test_schema(); - let ipc_data = create_test_ipc_data(&schema); - let mut create_req = CreateTableRequest::new(); - create_req.id = Some(vec![table_name.to_string()]); - namespace - .create_table(create_req, bytes::Bytes::from(ipc_data)) - .await - .unwrap(); - - let table_id = vec![table_name.to_string()]; - let dataset = DatasetBuilder::from_namespace(namespace.clone(), table_id.clone()) - .await - .unwrap() - .load() - .await - .unwrap(); - - // Find existing manifest and create a staging copy - let versions_path = dataset.versions_dir(); - let manifest_metas: Vec<_> = dataset - .object_store(None) - .await - .unwrap() - .inner - .list(Some(&versions_path)) - .try_collect() - .await - .unwrap(); - - let manifest_meta = manifest_metas - .iter() 
- .find(|m| { - m.location - .filename() - .map(|f| f.ends_with(".manifest")) - .unwrap_or(false) - }) - .expect("No manifest file found"); - - let manifest_data = dataset - .object_store(None) - .await - .unwrap() - .inner - .get(&manifest_meta.location) - .await - .unwrap() - .bytes() - .await - .unwrap(); - - let staging_path = dataset - .versions_dir() - .join(format!("staging_{}", table_name)); - dataset - .object_store(None) - .await - .unwrap() - .inner - .put(&staging_path, manifest_data.into()) - .await - .unwrap(); - - (table_id, staging_path) - } - - #[tokio::test] - async fn test_table_version_storage_enabled_requires_manifest() { - // table_version_storage_enabled=true requires manifest_enabled=true - let temp_dir = TempStdDir::default(); - let temp_path = temp_dir.to_str().unwrap(); - - let result = DirectoryNamespaceBuilder::new(temp_path) - .table_version_storage_enabled(true) - .manifest_enabled(false) - .build() - .await; - - assert!( - result.is_err(), - "Should fail when table_version_storage_enabled=true but manifest_enabled=false" - ); - } - - #[tokio::test] - async fn test_create_table_version_records_in_manifest() { - // When table_version_storage_enabled is enabled, single create_table_version - // should also record the version in __manifest - let temp_dir = TempStrDir::default(); - let temp_path: &str = &temp_dir; - - let namespace = create_managed_namespace(temp_path).await; - let ns: Arc = namespace.clone(); - - let (table_id, staging_path) = - create_table_and_get_staging(ns.clone(), "table_managed").await; - - // Create version 2 - let mut create_req = CreateTableVersionRequest::new(2, staging_path.to_string()); - create_req.id = Some(table_id.clone()); - create_req.naming_scheme = Some("V2".to_string()); - let response = namespace.create_table_version(create_req).await.unwrap(); - - assert!(response.version.is_some()); - let version = response.version.unwrap(); - assert_eq!(version.version, 2); - - // Verify the version is recorded 
in __manifest by querying it - let manifest_ns = namespace.manifest_ns.as_ref().unwrap(); - let table_id_str = manifest::ManifestNamespace::str_object_id(&table_id); - let versions = manifest_ns - .query_table_versions(&table_id_str, false, None) - .await - .unwrap(); - - assert!( - !versions.is_empty(), - "Version should be recorded in __manifest" - ); - let (ver, _path) = &versions[0]; - assert_eq!(*ver, 2, "Recorded version should be 2"); - } - } - #[tokio::test] async fn test_list_all_tables() { use lance_namespace::models::ListTablesRequest; diff --git a/rust/lance-namespace-impls/src/dir/manifest.rs b/rust/lance-namespace-impls/src/dir/manifest.rs index 11c9e1c193d..4f3e53ba806 100644 --- a/rust/lance-namespace-impls/src/dir/manifest.rs +++ b/rust/lance-namespace-impls/src/dir/manifest.rs @@ -44,11 +44,10 @@ use lance_namespace::models::{ CreateNamespaceRequest, CreateNamespaceResponse, CreateTableRequest, CreateTableResponse, DeclareTableRequest, DeclareTableResponse, DeregisterTableRequest, DeregisterTableResponse, DescribeNamespaceRequest, DescribeNamespaceResponse, DescribeTableRequest, - DescribeTableResponse, DescribeTableVersionResponse, DropNamespaceRequest, - DropNamespaceResponse, DropTableRequest, DropTableResponse, ListNamespacesRequest, - ListNamespacesResponse, ListTableVersionsResponse, ListTablesRequest, ListTablesResponse, - NamespaceExistsRequest, RegisterTableRequest, RegisterTableResponse, TableExistsRequest, - TableVersion, + DescribeTableResponse, DropNamespaceRequest, DropNamespaceResponse, DropTableRequest, + DropTableResponse, ListNamespacesRequest, ListNamespacesResponse, ListTablesRequest, + ListTablesResponse, NamespaceExistsRequest, RegisterTableRequest, RegisterTableResponse, + TableExistsRequest, }; use lance_namespace::schema::arrow_schema_to_json; use lance_table::feature_flags::apply_feature_flags; @@ -94,7 +93,6 @@ const MANIFEST_INDEX_BATCH_SIZE: usize = 8192; pub enum ObjectType { Namespace, Table, - TableVersion, } impl 
ObjectType { @@ -102,7 +100,6 @@ impl ObjectType { match self { Self::Namespace => "namespace", Self::Table => "table", - Self::TableVersion => "table_version", } } @@ -110,7 +107,6 @@ impl ObjectType { match s { "namespace" => Ok(Self::Namespace), "table" => Ok(Self::Table), - "table_version" => Ok(Self::TableVersion), _ => Err(NamespaceError::Internal { message: format!("Invalid object type: {}", s), } @@ -173,7 +169,7 @@ pub struct TableInfo { pub struct ManifestEntry { /// The unique object identifier (e.g., table name or version object_id) pub object_id: String, - /// The type of the object (Namespace, Table, or TableVersion) + /// The type of the object (Namespace or Table) pub object_type: ObjectType, /// The storage location (e.g., directory name for tables) pub location: Option, @@ -576,89 +572,6 @@ impl ManifestStreamMutation for DeleteObjectMutation { } } -enum DeleteTableVersionsTarget { - ObjectIds(HashSet), - Ranges(Vec), -} - -#[derive(Clone)] -struct DeleteTableVersionRangeTarget { - object_id_prefix: String, - ranges: Vec<(i64, i64)>, -} - -impl DeleteTableVersionRangeTarget { - fn matches(&self, object_id: &str) -> bool { - let Some(version) = object_id - .strip_prefix(&self.object_id_prefix) - .and_then(|suffix| suffix.parse::().ok()) - else { - return false; - }; - - self.ranges - .iter() - .any(|(start, end)| *start <= version && version <= *end) - } -} - -impl DeleteTableVersionsTarget { - fn matches(&self, object_id: &str) -> bool { - match self { - Self::ObjectIds(object_ids) => object_ids.contains(object_id), - Self::Ranges(targets) => targets.iter().any(|target| target.matches(object_id)), - } - } -} - -struct DeleteTableVersionsMutation { - target: DeleteTableVersionsTarget, - deleted_count: i64, -} - -impl ManifestStreamMutation for DeleteTableVersionsMutation { - type Output = i64; - - fn process_existing_row( - &mut self, - row: ManifestRowValue, - output: &mut ManifestBatchBuilder, - index_data: &mut ManifestIndexAccumulator, - ) -> 
Result<()> { - if row.object_type == ObjectType::TableVersion && self.target.matches(&row.object_id) { - self.deleted_count += 1; - return Ok(()); - } - - output.append( - index_data, - ManifestOutputRow { - object_id: &row.object_id, - object_type: row.object_type, - location: row.location.as_deref(), - metadata: row.metadata.as_deref(), - base_objects: row.base_objects.as_deref(), - }, - ) - } - - fn append_rows( - &mut self, - _output: &mut ManifestBatchBuilder, - _index_data: &mut ManifestIndexAccumulator, - ) -> Result<()> { - Ok(()) - } - - fn finish(&self) -> CopyOnWriteMutation { - if self.deleted_count > 0 { - CopyOnWriteMutation::updated(self.deleted_count) - } else { - CopyOnWriteMutation::unchanged(0) - } - } -} - /// Information about a namespace stored in the manifest #[derive(Debug, Clone)] pub struct NamespaceInfo { @@ -922,15 +835,10 @@ impl ManifestNamespace { dir_listing_enabled: bool, inline_optimization_enabled: bool, commit_retries: Option, - table_version_storage_enabled: bool, ) -> Result { - let manifest_dataset = Self::ensure_manifest_table_up_to_date( - &root, - &storage_options, - session.clone(), - table_version_storage_enabled, - ) - .await?; + let manifest_dataset = + Self::ensure_manifest_table_up_to_date(&root, &storage_options, session.clone()) + .await?; Ok(Self { root, @@ -994,60 +902,6 @@ impl ManifestNamespace { format!("table id '{}'", Self::str_object_id(table_id)) } - /// Format a version number as a zero-padded lexicographically sortable string. - /// - /// Versions are stored as 20-digit zero-padded integers (e.g., `00000000000000000001` - /// for version 1) so that string-based range queries and sorting work correctly. - pub fn format_table_version(version: i64) -> String { - format!("{:020}", version) - } - - /// Build the object_id for a table version entry. 
- /// - /// Format: `{table_object_id}${zero_padded_version}` - pub fn build_version_object_id(table_object_id: &str, version: i64) -> String { - format!( - "{}{}{}", - table_object_id, - DELIMITER, - Self::format_table_version(version) - ) - } - - fn build_version_object_id_prefix(table_object_id: &str) -> String { - format!("{}{}", table_object_id, DELIMITER) - } - - fn normalize_table_version_ranges(ranges: &[(i64, i64)]) -> Vec<(i64, i64)> { - let mut normalized = ranges - .iter() - .filter_map(|(start, end)| (*start <= *end).then_some((*start, *end))) - .collect::>(); - normalized.sort_unstable(); - - let mut merged: Vec<(i64, i64)> = Vec::with_capacity(normalized.len()); - for (start, end) in normalized { - let Some((_last_start, last_end)) = merged.last_mut() else { - merged.push((start, end)); - continue; - }; - if start <= *last_end + 1 { - *last_end = (*last_end).max(end); - continue; - } - merged.push((start, end)); - } - merged - } - - /// Parse a version number from the version suffix of a table version object_id. - /// - /// The object_id is formatted as `{table_id}${zero_padded_version}`. - pub fn parse_version_from_object_id(object_id: &str) -> Option { - let (_namespace, name) = Self::parse_object_id(object_id); - name.parse::().ok() - } - /// Generate a new directory name in format: `_` /// The hash is used to (1) optimize object store throughput, /// (2) have high enough entropy in a short period of time to prevent issues like @@ -2423,318 +2277,6 @@ impl ManifestNamespace { .await } - /// Query the manifest for all versions of a table, sorted by version. - /// - /// Returns a list of (version, metadata_json_string) tuples where metadata_json_string - /// contains the full metadata JSON stored in the manifest (manifest_path, manifest_size, - /// e_tag, naming_scheme). - /// - /// **Known limitation**: All matching rows are loaded into memory, sorted in Rust, - /// and then truncated. 
For tables with a very large number of versions this may be - /// expensive. Pushing sort/limit into the scan is not yet supported by Lance. - pub async fn query_table_versions( - &self, - object_id: &str, - descending: bool, - limit: Option, - ) -> Result> { - let escaped_id = object_id.replace('\'', "''"); - // table_version object_ids are formatted as "{object_id}${zero_padded_version}" - let filter = format!( - "object_type = 'table_version' AND starts_with(object_id, '{}{}')", - escaped_id, DELIMITER - ); - let mut scanner = self.manifest_scanner().await?; - scanner.filter(&filter).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to filter: {:?}", e), - }) - })?; - scanner.project(&["object_id", "metadata"]).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to project: {:?}", e), - }) - })?; - let batches = Self::execute_scanner(scanner).await?; - - let mut versions: Vec<(i64, String)> = Vec::new(); - for batch in batches { - if batch.num_rows() == 0 { - continue; - } - let object_id_array = Self::get_string_column(&batch, "object_id")?; - let metadata_array = Self::get_string_column(&batch, "metadata")?; - for i in 0..batch.num_rows() { - let oid = object_id_array.value(i); - // Parse version from object_id - if let Some(version) = Self::parse_version_from_object_id(oid) { - let metadata_str = metadata_array.value(i).to_string(); - versions.push((version, metadata_str)); - } - } - } - - if descending { - versions.sort_by(|a, b| b.0.cmp(&a.0)); - } else { - versions.sort_by(|a, b| a.0.cmp(&b.0)); - } - - if let Some(limit) = limit { - versions.truncate(limit as usize); - } - - Ok(versions) - } - - /// Query the manifest for a specific version of a table. - /// - /// Returns the full metadata JSON string if found, which contains - /// manifest_path, manifest_size, e_tag, and naming_scheme. 
- /// - pub async fn query_table_version( - &self, - object_id: &str, - version: i64, - ) -> Result> { - let version_object_id = Self::build_version_object_id(object_id, version); - self.query_table_version_by_object_id(&version_object_id) - .await - } - - /// Query a specific table version by its exact object_id. - async fn query_table_version_by_object_id( - &self, - version_object_id: &str, - ) -> Result> { - let escaped_id = version_object_id.replace('\'', "''"); - let filter = format!( - "object_id = '{}' AND object_type = 'table_version'", - escaped_id - ); - let mut scanner = self.manifest_scanner().await?; - scanner.filter(&filter).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to filter: {:?}", e), - }) - })?; - scanner.project(&["metadata"]).map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!("Failed to project: {:?}", e), - }) - })?; - let batches = Self::execute_scanner(scanner).await?; - - for batch in batches { - if batch.num_rows() == 0 { - continue; - } - let metadata_array = Self::get_string_column(&batch, "metadata")?; - return Ok(Some(metadata_array.value(0).to_string())); - } - - Ok(None) - } - - /// Delete table version entries from the manifest for a given table and version ranges. - /// - /// Each range is (start_version, end_version) inclusive. Deletes all matching - /// `object_type = 'table_version'` entries whose object_id matches - /// `{object_id}${zero_padded_version}`. - /// - /// Applies the ranges while streaming the manifest rewrite, without expanding - /// sparse ranges into every possible version object id. - pub async fn delete_table_versions( - &self, - object_id: &str, - ranges: &[(i64, i64)], - ) -> Result { - self.batch_delete_table_versions_by_ranges(&[(object_id.to_string(), ranges.to_vec())]) - .await - } - - /// Atomically delete table version entries from the manifest for multiple - /// tables and version ranges. 
- pub async fn batch_delete_table_versions_by_ranges( - &self, - table_ranges: &[(String, Vec<(i64, i64)>)], - ) -> Result { - let targets = table_ranges - .iter() - .filter_map(|(object_id, ranges)| { - let ranges = Self::normalize_table_version_ranges(ranges); - if ranges.is_empty() { - None - } else { - Some(DeleteTableVersionRangeTarget { - object_id_prefix: Self::build_version_object_id_prefix(object_id), - ranges, - }) - } - }) - .collect::>(); - if targets.is_empty() { - return Ok(0); - } - - self.rewrite_manifest("Failed to delete table versions from manifest", || { - DeleteTableVersionsMutation { - target: DeleteTableVersionsTarget::Ranges(targets.clone()), - deleted_count: 0, - } - }) - .await - } - - /// Atomically delete table version entries from the manifest by their object_ids. - /// - /// This method supports multi-table transactional deletion: all specified - /// object_ids (which may span multiple tables) are deleted in a single atomic - /// copy-on-write manifest rewrite. Either all entries are removed or none are. - /// - /// Object IDs are formatted as `{table_id}${version}`. - pub async fn batch_delete_table_versions_by_object_ids( - &self, - object_ids: &[String], - ) -> Result { - if object_ids.is_empty() { - return Ok(0); - } - - let object_ids = object_ids.iter().cloned().collect::>(); - self.rewrite_manifest("Failed to delete table versions from manifest", || { - DeleteTableVersionsMutation { - target: DeleteTableVersionsTarget::ObjectIds(object_ids.clone()), - deleted_count: 0, - } - }) - .await - } - - /// Set a property flag in the __manifest table's metadata key-value map. - /// - /// This uses `dataset.update_metadata()` to persist the flag in the - /// __manifest dataset's table metadata, rather than inserting a row. - /// If the property already exists with the same value, this is a no-op. 
- pub async fn set_property(&self, name: &str, value: &str) -> Result<()> { - let _mutation_guard = self.manifest_mutation_lock.lock().await; - let dataset_guard = self.manifest_dataset.get().await?; - if dataset_guard.metadata().get(name) == Some(&value.to_string()) { - return Ok(()); - } - drop(dataset_guard); - - let mut dataset_guard = self.manifest_dataset.get_mut().await?; - dataset_guard - .update_metadata([(name, value)]) - .await - .map_err(|e| { - lance_core::Error::from(NamespaceError::Internal { - message: format!( - "Failed to set property '{}' in __manifest metadata: {}", - name, e - ), - }) - })?; - Ok(()) - } - - /// Check if a property flag exists in the __manifest table's metadata key-value map. - pub async fn has_property(&self, name: &str) -> Result { - let dataset_guard = self.manifest_dataset.get().await?; - Ok(dataset_guard.metadata().contains_key(name)) - } - - /// Parse metadata JSON into a `TableVersion`. - /// - /// Returns `None` if metadata is invalid or missing required fields. - fn parse_table_version(version: i64, metadata_str: &str) -> Option { - let meta: serde_json::Value = match serde_json::from_str(metadata_str) { - Ok(v) => v, - Err(e) => { - log::warn!( - "Skipping version {} due to invalid metadata JSON: {}", - version, - e - ); - return None; - } - }; - let manifest_path = match meta.get("manifest_path").and_then(|v| v.as_str()) { - Some(p) => p.to_string(), - None => { - log::warn!( - "Skipping version {} due to missing 'manifest_path' in metadata — \ - this may indicate data corruption", - version - ); - return None; - } - }; - let manifest_size = meta.get("manifest_size").and_then(|v| v.as_i64()); - let e_tag = meta - .get("e_tag") - .and_then(|v| v.as_str()) - .map(|s| s.to_string()); - Some(TableVersion { - version, - manifest_path, - manifest_size, - e_tag, - timestamp_millis: None, - metadata: None, - }) - } - - /// List table versions from the __manifest table. 
- /// - /// Queries the manifest for all versions of the given table and returns - /// them as a `ListTableVersionsResponse`. - pub async fn list_table_versions( - &self, - table_id: &[String], - descending: bool, - limit: Option, - ) -> Result { - let object_id = Self::str_object_id(table_id); - let manifest_versions = self - .query_table_versions(&object_id, descending, limit) - .await?; - - let table_versions: Vec = manifest_versions - .into_iter() - .filter_map(|(version, metadata_str)| Self::parse_table_version(version, &metadata_str)) - .collect(); - - Ok(ListTableVersionsResponse { - versions: table_versions, - page_token: None, - }) - } - - /// Describe a specific table version from the __manifest table. - /// - /// Queries the manifest for a specific version and returns it as a - /// `DescribeTableVersionResponse`. Returns an error if the version is not found. - pub async fn describe_table_version( - &self, - table_id: &[String], - version: i64, - ) -> Result { - let object_id = Self::str_object_id(table_id); - if let Some(metadata_str) = self.query_table_version(&object_id, version).await? - && let Some(tv) = Self::parse_table_version(version, &metadata_str) - { - return Ok(DescribeTableVersionResponse { - version: Box::new(tv), - }); - } - Err(NamespaceError::TableVersionNotFound { - message: format!("version {} for table {:?}", version, table_id), - } - .into()) - } - /// Register a table in the manifest without creating the physical table (internal helper for migration) pub async fn register_table(&self, name: &str, location: String) -> Result<()> { let object_id = Self::build_object_id(&[], name); @@ -2839,12 +2381,10 @@ impl ManifestNamespace { /// 1. Try to load an existing manifest table /// 2. If it exists, check and migrate the schema if needed (e.g., add primary key metadata) /// 3. If it doesn't exist, create a new manifest table with the current schema - /// 4. 
Persist feature flags (e.g., table_version_storage_enabled) if requested async fn ensure_manifest_table_up_to_date( root: &str, storage_options: &Option>, session: Option>, - table_version_storage_enabled: bool, ) -> Result { let manifest_path = format!("{}/{}", root, MANIFEST_TABLE_NAME); log::debug!("Attempting to load manifest from {}", manifest_path); @@ -2899,27 +2439,6 @@ impl ManifestNamespace { })?; } - // Persist table_version_storage_enabled flag in __manifest so that once - // enabled, it becomes a permanent property of this namespace. - if table_version_storage_enabled { - let needs_flag = dataset - .metadata() - .get("table_version_storage_enabled") - .map(|v| v != "true") - .unwrap_or(true); - - if needs_flag - && let Err(e) = dataset - .update_metadata([("table_version_storage_enabled", "true")]) - .await - { - log::warn!( - "Failed to persist table_version_storage_enabled flag in __manifest: {:?}", - e - ); - } - } - Ok(DatasetConsistencyWrapper::new(dataset)) } else { log::info!("Creating new manifest table at {}", manifest_path); @@ -4067,7 +3586,6 @@ mod tests { true, inline_optimization_enabled, commit_retries, - false, ) .await .unwrap() @@ -4424,90 +3942,6 @@ mod tests { ); } - #[tokio::test] - async fn test_manifest_delete_table_versions_by_ranges() { - let temp_dir = TempStdDir::default(); - let temp_path = temp_dir.to_str().unwrap(); - let manifest_ns = create_manifest_namespace(temp_path, false).await; - let table_id = "table"; - let entries = (1..=5) - .map(|version| ManifestEntry { - object_id: ManifestNamespace::build_version_object_id(table_id, version), - object_type: ObjectType::TableVersion, - location: None, - metadata: Some( - serde_json::json!({ - "manifest_path": format!("_versions/{}.manifest", version), - }) - .to_string(), - ), - }) - .collect::>(); - manifest_ns - .insert_into_manifest_with_metadata(entries, None) - .await - .unwrap(); - - let deleted = manifest_ns - .delete_table_versions(table_id, &[(2, 3), (5, 5)]) - 
.await - .unwrap(); - assert_eq!(deleted, 3); - - let remaining = manifest_ns - .query_table_versions(table_id, false, None) - .await - .unwrap() - .into_iter() - .map(|(version, _)| version) - .collect::>(); - assert_eq!(remaining, vec![1, 4]); - } - - #[tokio::test] - async fn test_manifest_delete_table_versions_by_object_ids() { - let temp_dir = TempStdDir::default(); - let temp_path = temp_dir.to_str().unwrap(); - let manifest_ns = create_manifest_namespace(temp_path, false).await; - let table_id = "table"; - let entries = (1..=3) - .map(|version| ManifestEntry { - object_id: ManifestNamespace::build_version_object_id(table_id, version), - object_type: ObjectType::TableVersion, - location: None, - metadata: Some( - serde_json::json!({ - "manifest_path": format!("_versions/{}.manifest", version), - }) - .to_string(), - ), - }) - .collect::>(); - manifest_ns - .insert_into_manifest_with_metadata(entries, None) - .await - .unwrap(); - - let object_ids = vec![ - ManifestNamespace::build_version_object_id(table_id, 1), - ManifestNamespace::build_version_object_id(table_id, 3), - ]; - let deleted = manifest_ns - .batch_delete_table_versions_by_object_ids(&object_ids) - .await - .unwrap(); - assert_eq!(deleted, 2); - - let remaining = manifest_ns - .query_table_versions(table_id, false, None) - .await - .unwrap() - .into_iter() - .map(|(version, _)| version) - .collect::>(); - assert_eq!(remaining, vec![2]); - } - #[tokio::test] async fn test_manifest_noop_delete_uses_latest_snapshot() { let temp_dir = TempStdDir::default(); diff --git a/rust/lance-namespace-impls/src/rest_adapter.rs b/rust/lance-namespace-impls/src/rest_adapter.rs index 7324ab0bb0e..44ebd866810 100644 --- a/rust/lance-namespace-impls/src/rest_adapter.rs +++ b/rust/lance-namespace-impls/src/rest_adapter.rs @@ -1527,8 +1527,7 @@ mod tests { } /// Like [`Self::new`], with managed versioning (table version - /// tracking through the `__manifest` catalog) enabled on the - /// backend. 
+ /// tracking) enabled on the backend. async fn new_managed() -> Self { Self::build(true).await } @@ -1540,9 +1539,7 @@ mod tests { // Create DirectoryNamespace backend with manifest enabled let mut builder = DirectoryNamespaceBuilder::new(&temp_path).manifest_enabled(true); if managed_versioning { - builder = builder - .table_version_tracking_enabled(true) - .table_version_storage_enabled(true); + builder = builder.table_version_tracking_enabled(true); } let backend = builder.build().await.unwrap(); let backend = Arc::new(backend); From ccefc534488e59f9dceed574911fe1689e0530fa Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Mon, 15 Jun 2026 15:10:54 +0800 Subject: [PATCH 098/177] build: remove brotli from dependency graph (#7270) This PR removes Brotli from Lance's main Rust release dependency graph. The root cause was not Lance directly using Brotli. Brotli was pulled into the Rust workspace through three indirect paths: - `datafusion-substrait` enables its `physical` feature by default, which enables `datafusion/parquet`, bringing in `datafusion-datasource-parquet` and Parquet's default compression features. - `rust/examples` had a direct `parquet` dependency with default features enabled. - `lance-datagen` depended on `random_word`, whose compressed word lists pull in Brotli. This PR removes those paths by disabling `datafusion-substrait` default features, making direct Parquet usage explicit with only `arrow` and `async`, and replacing `random_word` with a small local word list. For the main Lance release dependency graph, `lance` does not depend on Parquet: `cargo tree -p lance --edges normal,build -i parquet` prints nothing. Parquet remains only where it is explicitly needed by tests/examples, with default compression features disabled. The root all-features workspace graph no longer contains `brotli` or `datafusion-datasource-parquet`. 
One important exception is `pylance`: the Python package intentionally keeps `FFILanceTableProvider`, which depends on `datafusion-ffi`. Today `datafusion-ffi` enables `datafusion-proto/default`, and `datafusion-proto` defaults to Parquet support. Because Cargo features are additive, `pylance` cannot disable that upstream default feature from its own manifest. This PR keeps the Python FFI API stable and pins the Python-side allocator dependency resolution to the known working Brotli combination while #7271 tracks removing this remaining Python-only Brotli path separately. --- Cargo.lock | 103 ------------- Cargo.toml | 4 +- python/Cargo.lock | 134 +---------------- python/Cargo.toml | 6 +- rust/examples/Cargo.toml | 2 +- rust/lance-datagen/Cargo.toml | 1 - rust/lance-datagen/src/generator.rs | 226 ++++++++++++++++++++++++++-- 7 files changed, 221 insertions(+), 255 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2050bd0698e..59ca17eec12 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -66,21 +66,6 @@ version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "514ce16346f9fc96702fd52f2ae7e383b185516ee6f556efd7c3176be8fe7bea" -[[package]] -name = "alloc-no-stdlib" -version = "2.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" - -[[package]] -name = "alloc-stdlib" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" -dependencies = [ - "alloc-no-stdlib", -] - [[package]] name = "alloca" version = "0.4.0" @@ -1194,27 +1179,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "brotli" -version = "8.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8119e4516436f5708bbc474a9d395bf12f1b5395e93a92a56e647ac3388c8610" -dependencies = [ - "alloc-no-stdlib", - "alloc-stdlib", - 
"brotli-decompressor", -] - -[[package]] -name = "brotli-decompressor" -version = "5.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5962523e1b92ce1b5e793d9169b9943eece10d39f62550bc04bb605d75b94924" -dependencies = [ - "alloc-no-stdlib", - "alloc-stdlib", -] - [[package]] name = "bs58" version = "0.5.1" @@ -2026,7 +1990,6 @@ dependencies = [ "datafusion-datasource-arrow", "datafusion-datasource-csv", "datafusion-datasource-json", - "datafusion-datasource-parquet", "datafusion-execution", "datafusion-expr", "datafusion-expr-common", @@ -2048,7 +2011,6 @@ dependencies = [ "log", "object_store", "parking_lot", - "parquet", "rand 0.9.4", "regex", "sqlparser", @@ -2123,7 +2085,6 @@ dependencies = [ "libc", "log", "object_store", - "parquet", "paste", "sqlparser", "tokio", @@ -2241,36 +2202,6 @@ dependencies = [ "tokio-stream", ] -[[package]] -name = "datafusion-datasource-parquet" -version = "53.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a8e0365e0e08e8ff94d912f0ababcf9065a1a304018ba90b1fc83c855b4997" -dependencies = [ - "arrow", - "async-trait", - "bytes", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions-aggregate-common", - "datafusion-physical-expr", - "datafusion-physical-expr-adapter", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-pruning", - "datafusion-session", - "futures", - "itertools 0.14.0", - "log", - "object_store", - "parking_lot", - "parquet", - "tokio", -] - [[package]] name = "datafusion-doc" version = "53.1.0" @@ -3078,7 +3009,6 @@ checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" dependencies = [ "crc32fast", "miniz_oxide", - "zlib-rs", ] [[package]] @@ -4706,7 +4636,6 @@ dependencies = [ "rand 0.9.4", "rand_distr", "rand_xoshiro", - "random_word", ] [[package]] @@ -6440,26 +6369,19 @@ dependencies = [ 
"arrow-schema", "arrow-select", "base64 0.22.1", - "brotli", "bytes", "chrono", - "flate2", "futures", "half", "hashbrown 0.17.1", - "lz4_flex", "num-bigint", "num-integer", "num-traits", - "object_store", "paste", "seq-macro", - "simdutf8", - "snap", "thrift", "tokio", "twox-hash", - "zstd", ] [[package]] @@ -7197,19 +7119,6 @@ dependencies = [ "rand_core 0.9.5", ] -[[package]] -name = "random_word" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e47a395bdb55442b883c89062d6bcff25dc90fa5f8369af81e0ac6d49d78cf81" -dependencies = [ - "ahash", - "brotli", - "paste", - "rand 0.9.4", - "unicase", -] - [[package]] name = "rangemap" version = "1.7.1" @@ -8379,12 +8288,6 @@ dependencies = [ "syn 2.0.117", ] -[[package]] -name = "snap" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" - [[package]] name = "socket2" version = "0.6.4" @@ -10534,12 +10437,6 @@ dependencies = [ "syn 2.0.117", ] -[[package]] -name = "zlib-rs" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513" - [[package]] name = "zmij" version = "1.0.21" diff --git a/Cargo.toml b/Cargo.toml index c9740be6642..3b57ab3498e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -133,13 +133,13 @@ datafusion = { version = "53.0.0", default-features = false, features = [ "unicode_expressions", ] } datafusion-common = "53.0.0" -datafusion-functions = { version = "53.0.0", features = ["regex_expressions"] } +datafusion-functions = { version = "53.0.0", default-features = false, features = ["regex_expressions"] } datafusion-sql = "53.0.0" datafusion-expr = "53.0.0" datafusion-ffi = "53.0.0" datafusion-physical-expr = "53.0.0" datafusion-physical-plan = "53.0.0" -datafusion-substrait = "53.0.0" +datafusion-substrait = { version = "53.0.0", default-features = 
false } dirs = "6.0.0" either = "1.0" fst = { version = "0.4.7", features = ["levenshtein"] } diff --git a/python/Cargo.lock b/python/Cargo.lock index f2f9990e7cd..c8f484c73d0 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -185,15 +185,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "ar_archive_writer" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4087686b4b0a3427190bae57a1d9a478dbb2d40c5dc1bd6e2b6d797913bdd348" -dependencies = [ - "object", -] - [[package]] name = "arc-swap" version = "1.9.1" @@ -1232,15 +1223,6 @@ dependencies = [ "either", ] -[[package]] -name = "bzip2" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3a53fac24f34a81bc9954b5d6cfce0c21e18ec6959f44f56e8e90e4bb7c346c" -dependencies = [ - "libbz2-rs-sys", -] - [[package]] name = "cbc" version = "0.1.2" @@ -1430,13 +1412,9 @@ version = "0.4.38" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ce2548391e9c1929c21bf6aa2680af86fe4c1b33e6cea9ac1cfeec0bd11218cf" dependencies = [ - "bzip2", "compression-core", "flate2", - "liblzma", "memchr", - "zstd", - "zstd-safe", ] [[package]] @@ -1826,7 +1804,6 @@ dependencies = [ "arrow-schema", "async-trait", "bytes", - "bzip2", "chrono", "datafusion-catalog", "datafusion-catalog-listing", @@ -1836,7 +1813,6 @@ dependencies = [ "datafusion-datasource-arrow", "datafusion-datasource-csv", "datafusion-datasource-json", - "datafusion-datasource-parquet", "datafusion-execution", "datafusion-expr", "datafusion-expr-common", @@ -1853,14 +1829,11 @@ dependencies = [ "datafusion-physical-plan", "datafusion-session", "datafusion-sql", - "flate2", "futures", "itertools 0.14.0", - "liblzma", "log", "object_store", "parking_lot", - "parquet", "rand 0.9.4", "regex", "sqlparser", @@ -1868,7 +1841,6 @@ dependencies = [ "tokio", "url", "uuid", - "zstd", ] [[package]] @@ -1938,7 +1910,6 @@ dependencies = [ "object_store", "parquet", 
"paste", - "recursive", "sqlparser", "tokio", "web-time", @@ -1962,10 +1933,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9fb386e1691355355a96419978a0022b7947b44d4a24a6ea99f00b6b485cbb6" dependencies = [ "arrow", - "async-compression", "async-trait", "bytes", - "bzip2", "chrono", "datafusion-common", "datafusion-common-runtime", @@ -1976,18 +1945,14 @@ dependencies = [ "datafusion-physical-expr-common", "datafusion-physical-plan", "datafusion-session", - "flate2", "futures", "glob", "itertools 0.14.0", - "liblzma", "log", "object_store", "rand 0.9.4", "tokio", - "tokio-util", "url", - "zstd", ] [[package]] @@ -2138,7 +2103,6 @@ dependencies = [ "indexmap 2.14.0", "itertools 0.14.0", "paste", - "recursive", "serde_json", "sqlparser", ] @@ -2348,7 +2312,6 @@ dependencies = [ "indexmap 2.14.0", "itertools 0.14.0", "log", - "recursive", "regex", "regex-syntax", ] @@ -2373,7 +2336,6 @@ dependencies = [ "parking_lot", "paste", "petgraph", - "recursive", "tokio", ] @@ -2425,7 +2387,6 @@ dependencies = [ "datafusion-physical-plan", "datafusion-pruning", "itertools 0.14.0", - "recursive", ] [[package]] @@ -2544,7 +2505,6 @@ dependencies = [ "datafusion-functions-nested", "indexmap 2.14.0", "log", - "recursive", "regex", "sqlparser", ] @@ -4323,7 +4283,6 @@ dependencies = [ "rand 0.9.4", "rand_distr", "rand_xoshiro", - "random_word", ] [[package]] @@ -4737,12 +4696,6 @@ dependencies = [ "lexical-util", ] -[[package]] -name = "libbz2-rs-sys" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34b357333733e8260735ba5894eb928c02ecc69c78715f01a8019e7fa7f2db4c" - [[package]] name = "libc" version = "0.2.186" @@ -4759,26 +4712,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "liblzma" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6033b77c21d1f56deeae8014eb9fbe7bdf1765185a6c508b5ca82eeaed7f899" -dependencies = [ - "liblzma-sys", -] - 
-[[package]] -name = "liblzma-sys" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a60851d15cd8c5346eca4ab8babff585be2ae4bc8097c067291d3ffe2add3b6" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - [[package]] name = "libm" version = "0.2.16" @@ -5277,15 +5210,6 @@ dependencies = [ "objc2-core-foundation", ] -[[package]] -name = "object" -version = "0.37.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" -dependencies = [ - "memchr", -] - [[package]] name = "object_store" version = "0.13.2" @@ -6111,16 +6035,6 @@ dependencies = [ "prost", ] -[[package]] -name = "psm" -version = "0.1.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "645dbe486e346d9b5de3ef16ede18c26e6c70ad97418f4874b8b1889d6e761ea" -dependencies = [ - "ar_archive_writer", - "cc", -] - [[package]] name = "ptr_meta" version = "0.3.1" @@ -6145,6 +6059,7 @@ dependencies = [ name = "pylance" version = "8.0.0-beta.13" dependencies = [ + "alloc-stdlib", "arrow", "arrow-array", "arrow-cast", @@ -6465,19 +6380,6 @@ dependencies = [ "rand_core 0.9.5", ] -[[package]] -name = "random_word" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e47a395bdb55442b883c89062d6bcff25dc90fa5f8369af81e0ac6d49d78cf81" -dependencies = [ - "ahash", - "brotli", - "paste", - "rand 0.9.4", - "unicase", -] - [[package]] name = "rangemap" version = "1.7.1" @@ -6510,26 +6412,6 @@ dependencies = [ "crossbeam-utils", ] -[[package]] -name = "recursive" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" -dependencies = [ - "recursive-proc-macro-impl", - "stacker", -] - -[[package]] -name = "recursive-proc-macro-impl" -version = "0.1.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" -dependencies = [ - "quote", - "syn 2.0.117", -] - [[package]] name = "redb" version = "3.1.3" @@ -7614,7 +7496,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dbf5ea8d4d7c808e1af1cbabebca9a2abe603bcefc22294c5b95018d53200cb7" dependencies = [ "log", - "recursive", "sqlparser_derive", ] @@ -7635,19 +7516,6 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" -[[package]] -name = "stacker" -version = "0.1.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "640c8cdd92b6b12f5bcb1803ca3bbf5ab96e5e6b6b96b9ab77dabe9e880b3190" -dependencies = [ - "cc", - "cfg-if 1.0.4", - "libc", - "psm", - "windows-sys 0.61.2", -] - [[package]] name = "static_assertions" version = "1.1.0" diff --git a/python/Cargo.toml b/python/Cargo.toml index 5530f6ff60a..82f5036a62b 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -19,9 +19,13 @@ arrow-cast = "58.0.0" arrow-data = "58.0.0" arrow-schema = "58.0.0" object_store = "0.13.2" -datafusion = "53.0.0" +datafusion = { version = "53.0.0", default-features = false } datafusion-ffi = "53.0.0" datafusion-common = "53.0.0" +# Keep the Python FFI build on the working Brotli allocator resolution until +# datafusion-ffi no longer enables datafusion-proto/default. +# See https://github.com/lance-format/lance/issues/7271. 
+alloc-stdlib = "=0.2.2" async-trait = "0.1" chrono = "0.4.42" env_logger = "0.11.7" diff --git a/rust/examples/Cargo.toml b/rust/examples/Cargo.toml index a4e760f8cbe..80eff457140 100644 --- a/rust/examples/Cargo.toml +++ b/rust/examples/Cargo.toml @@ -49,6 +49,6 @@ tokio = { workspace = true } all_asserts = "2.3.1" env_logger = "0.11.7" hf-hub = "0.4.2" -parquet = "58.0.0" +parquet = { version = "58.0.0", default-features = false, features = ["arrow", "async"] } tokenizers = "0.15.2" rand.workspace = true diff --git a/rust/lance-datagen/Cargo.toml b/rust/lance-datagen/Cargo.toml index eae1e3086b6..83b5aba3689 100644 --- a/rust/lance-datagen/Cargo.toml +++ b/rust/lance-datagen/Cargo.toml @@ -21,7 +21,6 @@ hex = "0.4.3" rand = { workspace = true } rand_distr = { workspace = true } rand_xoshiro = { workspace = true } -random_word = { version = "0.5", features = ["en"] } [dev-dependencies] criterion = { workspace = true } diff --git a/rust/lance-datagen/src/generator.rs b/rust/lance-datagen/src/generator.rs index 3756e354bea..39da4734619 100644 --- a/rust/lance-datagen/src/generator.rs +++ b/rust/lance-datagen/src/generator.rs @@ -21,7 +21,6 @@ use arrow_schema::{ArrowError, DataType, Field, Fields, IntervalUnit, Schema, Sc use futures::{StreamExt, stream::BoxStream}; use rand::{Rng, RngCore, SeedableRng, distr::Uniform}; use rand_distr::Zipf; -use random_word; use self::array::rand_with_distribution; @@ -1172,24 +1171,223 @@ impl ArrayGenerator for BinaryPrefixPlusCounterGenerator { } } -// Common English stop words placed at the front to be sampled more frequently +// Common English stop words placed at the front to be sampled more frequently. 
const STOP_WORDS: &[&str] = &[ "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with", ]; +const ENGLISH_WORDS: &[&str] = &[ + "ability", + "able", + "about", + "above", + "accept", + "access", + "account", + "across", + "action", + "active", + "activity", + "actual", + "address", + "adjust", + "admin", + "advance", + "agent", + "align", + "allow", + "amount", + "analysis", + "answer", + "application", + "archive", + "array", + "asset", + "async", + "attribute", + "available", + "balance", + "batch", + "binary", + "bitmap", + "block", + "branch", + "buffer", + "build", + "cache", + "capacity", + "catalog", + "change", + "chunk", + "client", + "cluster", + "column", + "commit", + "common", + "compare", + "compile", + "compute", + "condition", + "config", + "connect", + "content", + "context", + "control", + "convert", + "copy", + "core", + "count", + "create", + "current", + "cursor", + "data", + "dataset", + "decode", + "default", + "delete", + "delta", + "depend", + "derive", + "design", + "detail", + "detect", + "device", + "direct", + "display", + "document", + "domain", + "drive", + "dynamic", + "encode", + "engine", + "error", + "event", + "example", + "execute", + "expand", + "expect", + "export", + "extend", + "feature", + "field", + "filter", + "final", + "finish", + "format", + "fragment", + "future", + "generate", + "global", + "group", + "handle", + "header", + "index", + "input", + "insert", + "inspect", + "instance", + "integer", + "internal", + "item", + "join", + "kernel", + "large", + "layer", + "layout", + "length", + "level", + "limit", + "linear", + "local", + "logical", + "lookup", + "manage", + "manifest", + "memory", + "merge", + "metric", + "model", + "module", + "namespace", + "native", + "node", + "normal", + "number", + "object", + "offset", + "option", + "output", + 
"package", + "page", + "parallel", + "parse", + "partition", + "pattern", + "physical", + "plan", + "policy", + "prefix", + "prepare", + "primary", + "process", + "profile", + "project", + "property", + "query", + "range", + "reader", + "record", + "region", + "registry", + "request", + "resolve", + "resource", + "result", + "return", + "row", + "runtime", + "scalar", + "scan", + "schema", + "search", + "segment", + "select", + "session", + "setting", + "source", + "stable", + "stage", + "state", + "static", + "storage", + "stream", + "string", + "struct", + "table", + "target", + "task", + "thread", + "token", + "trace", + "transform", + "type", + "update", + "upload", + "value", + "vector", + "version", + "view", + "write", + "writer", +]; + /// Word list with stop words at the front for Zipf sampling, computed once. static SENTENCE_WORDS: LazyLock> = LazyLock::new(|| { - let all_words = random_word::all(random_word::Lang::En); - let mut words = Vec::with_capacity(STOP_WORDS.len() + all_words.len()); + let mut words = Vec::with_capacity(STOP_WORDS.len() + ENGLISH_WORDS.len()); words.extend(STOP_WORDS.iter().copied()); - words.extend( - all_words - .iter() - .filter(|w| !STOP_WORDS.contains(w)) - .copied(), - ); + words.extend(ENGLISH_WORDS.iter().copied()); words }); @@ -1279,7 +1477,7 @@ struct RandomWordGenerator { impl RandomWordGenerator { pub fn new(is_large: bool) -> Self { - let words = random_word::all(random_word::Lang::En); + let words = ENGLISH_WORDS; Self { words, is_large } } } @@ -3190,9 +3388,9 @@ mod tests { assert_eq!( *genn.generate(RowCount::from(3), &mut rng).unwrap(), arrow_array::BinaryArray::from_iter_values([ - vec![174, 178], - vec![64, 122, 207, 248], - vec![124, 3, 58] + vec![111, 9, 80], + vec![86, 118, 13, 209], + vec![68, 33, 202] ]) ); } From a8cbd3bf1c03b3b6c1dbc2994aaa09bb483a6dd0 Mon Sep 17 00:00:00 2001 From: Yang Cen Date: Mon, 15 Jun 2026 18:33:33 +0800 Subject: [PATCH 099/177] perf(vector): vectorize RaBitQ dist table 
quantization (#7241) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What `quantize_dist_table_into` / `quantize_dist_table_u16_into` quantize the per-(query, partition) `dim * 4`-entry f32 FastScan distance table into u8 (fast/normal approx modes) or u16 (accurate mode) LUT entries. Both were scalar: `itertools::minmax_by(total_cmp)` for the min/max pass (branchy pairwise compares that never vectorize; `minmax_impl` alone is 6.0-6.6% of query time in cpu-clock profiles of IVF_RQ on dbpedia-openai-1M, dim=1536, num_bits=5, nprobes=24) plus a scalar quantize-write loop. This moves them into a dedicated module `vector/bq/dist_table_quant.rs` with the same runtime-dispatch treatment as the ex-dot kernels (#7205): - min/max: two-accumulator 16-lane AVX-512 / 8-lane AVX2 folds; elsewhere a portable 16-lane fold that LLVM auto-vectorizes (NEON is part of the aarch64 baseline). - quantize: `(d - qmin) * factor`, `cvtps_epi32` (nearest-even, MXCSR default), then unsigned-saturating narrows (`vpmovusdb`/`vpmovusdw` on AVX-512; `packus` + lane-restore permutes on AVX2); scalar fallback uses `round_ties_even`. ## Numeric semantics - Rounding changes from `f32::round` (half away from zero) to half-to-even so the scalar fallback and all SIMD paths are bit-exact with each other. Relative to the old code this can move a LUT entry by 1 only on exact .5 ties, within the table's inherent quantization error. - SIMD min/max vs `total_cmp` differs only on NaN (inputs are finite sums of rotated-query components) and the sign of zero, which callers cannot observe (`d - qmin` and the `qmin == qmax` early-out are unchanged either way). - Degenerate ranges (table spread below ~`255/f32::MAX`, which overflows `factor` to inf, or above `f32::MAX`) now collapse to the zeroed-LUT early-out. 
Previously these produced garbage (NaN -> 0 casts); the SIMD narrows would additionally disagree across kernels on the NaN products, so the early-out keeps every path deterministic and bit-exact. ## Benchmarks GCP c4-standard-16 (AVX-512), `taskset -c 4`, criterion baseline = main (b6a99cda9), `cargo bench -p lance-index --bench rq`: | bench (DIM=1536, rows=16384) | before | after | change | |-------------------------------------|-----------|-----------|--------| | binary-only distance_all num_bits=3 | 130.89 us | 117.31 us | -10.3% | | binary-only distance_all num_bits=5 | 127.69 us | 110.43 us | -13.6% | | binary-only distance_all num_bits=9 | 127.91 us | 106.43 us | -16.4% | | full distance_all num_bits=3 | 413.89 us | 399.48 us | -3.9% | The untouched `RQ bulk ex kernel loop` control group moved within +-3% between runs, so the binary-only wins are far above the machine noise floor. Full-path results for num_bits=5/9 are within that noise envelope (the binary stage is a small share of those). On Apple M-series (NEON: portable min/max fold + autovectorized scalar quantize), the same paired criterion run improves binary-only distance_all by -24% / -13% / -8% (num_bits=3/5/9); the Mac's run-to-run noise is larger than the pinned GCP runs but all three move well past it. ## Tests - New differential tests run every available kernel (scalar, AVX2, AVX-512 when detected) against a straightforward `total_cmp` + `round_ties_even` reference and require bit-exact agreement: random inputs (lengths 1..6160 including non-multiples of 16/32, scales 1e-3..1e4), exact .5 ties (integer tables constructed so `factor == 0.5`), all-equal inputs, signed-zero mixes, degenerate ranges, and scratch-buffer reuse. - `cargo test -p lance-index --lib vector::bq`: 207 pass on aarch64 (scalar + NEON dispatch) and on AVX-512/AVX2 hardware (GCP c4, both debug and release). - `cargo fmt --all` and `cargo clippy --all --tests --benches -- -D warnings` are clean. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Fable 5 --- rust/lance-index/src/vector/bq.rs | 1 + .../src/vector/bq/dist_table_quant.rs | 935 ++++++++++++++++++ rust/lance-index/src/vector/bq/storage.rs | 195 ++-- 3 files changed, 1067 insertions(+), 64 deletions(-) create mode 100644 rust/lance-index/src/vector/bq/dist_table_quant.rs diff --git a/rust/lance-index/src/vector/bq.rs b/rust/lance-index/src/vector/bq.rs index 71c4eed7fd8..ad013683214 100644 --- a/rust/lance-index/src/vector/bq.rs +++ b/rust/lance-index/src/vector/bq.rs @@ -18,6 +18,7 @@ use crate::vector::bq::storage::RabitQuantizationMetadata; use crate::vector::quantizer::QuantizerBuildParams; pub mod builder; +pub(crate) mod dist_table_quant; pub mod ex_dot; pub mod rotation; pub mod storage; diff --git a/rust/lance-index/src/vector/bq/dist_table_quant.rs b/rust/lance-index/src/vector/bq/dist_table_quant.rs new file mode 100644 index 00000000000..22196f06edb --- /dev/null +++ b/rust/lance-index/src/vector/bq/dist_table_quant.rs @@ -0,0 +1,935 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! SIMD kernels for quantizing the RaBitQ FastScan distance table. +//! +//! Once per (query, probed partition) the `dim * 4`-entry `f32` distance +//! table is quantized into `u8` (fast/normal approx modes) or `u16` +//! (accurate mode) FastScan LUT entries: a min/max pass over the table +//! followed by an affine quantize-and-narrow pass. Both passes are branchy +//! in scalar form, so they get the same runtime-dispatch treatment as +//! [`super::ex_dot`]: explicit AVX-512/AVX2 kernels on x86_64 and a portable +//! fold elsewhere that LLVM auto-vectorizes (NEON is part of the aarch64 +//! baseline). +//! +//! Table values are sums of rotated-query components: always finite, never +//! NaN, so lanewise IEEE `min`/`max` matches `total_cmp` ordering. The only +//! 
divergence is the sign of zero, which callers cannot observe: `d - qmin` +//! and the `qmin == qmax` early-out are arithmetically identical either way. +//! +//! Quantization rounds half-to-even so that the scalar fallback and the SIMD +//! kernels agree bit-exactly. All paths round with fixed-mode rounding, +//! independent of the dynamic MXCSR rounding mode native code may have +//! installed: the SIMD kernels use the converts' static rounding and the +//! scalar path (also the SIMD tails) rounds via `f32::floor` rather than +//! `f32::round_ties_even`, which can lower to an MXCSR-honoring instruction on +//! x86. Relative to the pre-SIMD implementation (`f32::round`, +//! half-away-from-zero) this can move a LUT entry by 1 on exact .5 ties, which +//! is within the table's inherent quantization error. + +use std::mem::MaybeUninit; +use std::sync::LazyLock; + +use super::storage::SEGMENT_NUM_CODES; + +type MinMaxFn = fn(&[f32]) -> (f32, f32); +type QuantizeU8Fn = fn(&[f32], f32, f32, &mut [MaybeUninit]); +type QuantizeU16Fn = fn(&[f32], f32, f32, &mut [MaybeUninit]); + +/// How the caller reconstructs binary inner-product distances from the +/// FastScan accumulator sums computed against the quantized LUT. +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum DistTableDequant { + /// Reconstruct each distance with the affine map + /// `q_sum * (qmax - qmin) / SCALE + num_tables * qmin`. Returned whenever + /// that map is finite, including a zero/sub-resolution range — then the + /// LUT is zeroed and every distance collapses to the constant + /// `num_tables * qmin`. + Affine { qmin: f32, qmax: f32 }, + /// `num_tables * {qmin, qmax, qmax - qmin}` overflowed f32, so the affine + /// reconstruction would yield NaN/inf. The LUT is zeroed; the caller must + /// compute exact distances directly from the f32 table. 
+ Exact, +} + +/// Quantize `dist_table` into `u8` FastScan LUT entries in the caller-owned +/// scratch buffer, returning how the caller must dequantize the FastScan +/// sums (see [`DistTableDequant`]). `dist_table` must be non-empty and all +/// values finite. +pub fn quantize_dist_table_into( + dist_table: &[f32], + quantized_dist_table: &mut Vec, +) -> DistTableDequant { + debug_assert!(!dist_table.is_empty(), "dist table must be non-empty"); + let (qmin, qmax) = min_max(dist_table); + if dequant_overflows(dist_table.len(), qmin, qmax) { + // The caller's affine reconstruction would be non-finite; it computes + // exact distances and ignores the LUT, but keep the buffer valid. + quantized_dist_table.clear(); + quantized_dist_table.resize(dist_table.len(), 0); + return DistTableDequant::Exact; + } + let factor = u8::MAX as f32 / (qmax - qmin); + if !factor.is_finite() { + // Zero or sub-u8-resolution range (e.g. an all-zeros query): the LUT + // carries no information, but the finite affine map sends every sum + // to the constant `num_tables * qmin`. + quantized_dist_table.clear(); + quantized_dist_table.resize(dist_table.len(), 0); + return DistTableDequant::Affine { qmin, qmax }; + } + quantized_dist_table.clear(); + quantized_dist_table.reserve(dist_table.len()); + quantize_u8( + dist_table, + qmin, + factor, + &mut quantized_dist_table.spare_capacity_mut()[..dist_table.len()], + ); + // SAFETY: the kernel initialized every element in the reserved range. + unsafe { + quantized_dist_table.set_len(dist_table.len()); + } + DistTableDequant::Affine { qmin, qmax } +} + +/// `u16` variant of [`quantize_dist_table_into`] for the accurate approx +/// mode. 
+pub fn quantize_dist_table_u16_into( + dist_table: &[f32], + quantized_dist_table: &mut Vec, +) -> DistTableDequant { + debug_assert!(!dist_table.is_empty(), "dist table must be non-empty"); + let (qmin, qmax) = min_max(dist_table); + if dequant_overflows(dist_table.len(), qmin, qmax) { + quantized_dist_table.clear(); + quantized_dist_table.resize(dist_table.len(), 0); + return DistTableDequant::Exact; + } + let factor = u16::MAX as f32 / (qmax - qmin); + if !factor.is_finite() { + quantized_dist_table.clear(); + quantized_dist_table.resize(dist_table.len(), 0); + return DistTableDequant::Affine { qmin, qmax }; + } + quantized_dist_table.clear(); + quantized_dist_table.reserve(dist_table.len()); + quantize_u16( + dist_table, + qmin, + factor, + &mut quantized_dist_table.spare_capacity_mut()[..dist_table.len()], + ); + // SAFETY: the kernel initialized every element in the reserved range. + unsafe { + quantized_dist_table.set_len(dist_table.len()); + } + DistTableDequant::Affine { qmin, qmax } +} + +/// Whether the caller's affine dequantization +/// `q_sum * (qmax - qmin) / SCALE + num_tables * qmin` would overflow `f32` +/// for some row. Each row's reconstructed binary IP lies in +/// `[num_tables * qmin, num_tables * qmax]` and its quantized term is at most +/// `num_tables * (qmax - qmin)`, so if any of those is non-finite the table +/// must fall back to exact distances. The bound is scale-independent — the +/// `1 / SCALE` factor and the `q_sum <= num_tables * SCALE` range cancel. +/// Real dist tables are bounded sums of rotated-query components and never +/// approach this; the guard exists so a pathological query degrades to exact +/// distances instead of producing NaN. 
+fn dequant_overflows(table_len: usize, qmin: f32, qmax: f32) -> bool { + let num_tables = (table_len / SEGMENT_NUM_CODES) as f32; + !(num_tables * qmin).is_finite() + || !(num_tables * qmax).is_finite() + || !(num_tables * (qmax - qmin)).is_finite() +} + +fn min_max(values: &[f32]) -> (f32, f32) { + static KERNEL: LazyLock = LazyLock::new(select_min_max); + KERNEL(values) +} + +fn quantize_u8(values: &[f32], qmin: f32, factor: f32, out: &mut [MaybeUninit]) { + static KERNEL: LazyLock = LazyLock::new(select_quantize_u8); + KERNEL(values, qmin, factor, out) +} + +fn quantize_u16(values: &[f32], qmin: f32, factor: f32, out: &mut [MaybeUninit]) { + static KERNEL: LazyLock = LazyLock::new(select_quantize_u16); + KERNEL(values, qmin, factor, out) +} + +fn select_min_max() -> MinMaxFn { + #[cfg(target_arch = "x86_64")] + { + if std::arch::is_x86_feature_detected!("avx512f") { + return x86::min_max_avx512_dispatch; + } + if std::arch::is_x86_feature_detected!("avx2") { + return x86::min_max_avx2_dispatch; + } + } + min_max_fold +} + +fn select_quantize_u8() -> QuantizeU8Fn { + #[cfg(target_arch = "x86_64")] + { + if std::arch::is_x86_feature_detected!("avx512f") { + return x86::quantize_u8_avx512_dispatch; + } + if std::arch::is_x86_feature_detected!("avx2") { + return x86::quantize_u8_avx2_dispatch; + } + } + quantize_u8_scalar +} + +fn select_quantize_u16() -> QuantizeU16Fn { + #[cfg(target_arch = "x86_64")] + { + if std::arch::is_x86_feature_detected!("avx512f") { + return x86::quantize_u16_avx512_dispatch; + } + if std::arch::is_x86_feature_detected!("avx2") { + return x86::quantize_u16_avx2_dispatch; + } + } + quantize_u16_scalar +} + +const FOLD_LANES: usize = 16; + +/// Portable 16-lane min/max fold; the scalar fallback and the aarch64 path. +/// The `if` comparisons (rather than `f32::min`/`max`, which carry NaN +/// bookkeeping) lower to lanewise min/max instructions on targets with +/// baseline SIMD. 
+fn min_max_fold(values: &[f32]) -> (f32, f32) { + let mut mins = [f32::INFINITY; FOLD_LANES]; + let mut maxs = [f32::NEG_INFINITY; FOLD_LANES]; + let mut chunks = values.chunks_exact(FOLD_LANES); + for chunk in &mut chunks { + let chunk: &[f32; FOLD_LANES] = chunk.try_into().expect("chunks_exact length"); + for (i, &v) in chunk.iter().enumerate() { + mins[i] = if v < mins[i] { v } else { mins[i] }; + maxs[i] = if v > maxs[i] { v } else { maxs[i] }; + } + } + let mut min = f32::INFINITY; + let mut max = f32::NEG_INFINITY; + for v in mins { + min = if v < min { v } else { min }; + } + for v in maxs { + max = if v > max { v } else { max }; + } + for &v in chunks.remainder() { + min = if v < min { v } else { min }; + max = if v > max { v } else { max }; + } + (min, max) +} + +/// Round `x` to the nearest integer, ties to even — the same rule the SIMD +/// converts use — with fixed-mode operations only, so the result never +/// depends on the dynamic rounding mode native code may have installed. +/// +/// On x86, `f32::round_ties_even` can lower to an MXCSR-honoring instruction +/// (outside an SSE4.1 context), so nearest-even is built from `f32::floor`, +/// which is always fixed-mode. `x` is a non-negative quantization product, so +/// only the upward tie case is reachable, but the form is correct for any +/// finite `x` whose floor fits in `i64`. Elsewhere (e.g. aarch64) the standard +/// `round_ties_even` is already a fixed-mode instruction (`frintn`) that the +/// quantize loop — which has no dedicated SIMD kernel there — vectorizes, so +/// it is kept. 
+#[inline(always)] +fn round_ties_even_fixed(x: f32) -> f32 { + #[cfg(target_arch = "x86_64")] + { + let lower = x.floor(); + let frac = x - lower; + let round_up = frac > 0.5 || (frac == 0.5 && (lower as i64 & 1) != 0); + lower + f32::from(round_up) + } + #[cfg(not(target_arch = "x86_64"))] + { + x.round_ties_even() + } +} + +fn quantize_u8_scalar(values: &[f32], qmin: f32, factor: f32, out: &mut [MaybeUninit]) { + debug_assert_eq!(values.len(), out.len()); + for (quantized, &d) in out.iter_mut().zip(values) { + quantized.write(round_ties_even_fixed((d - qmin) * factor) as u8); + } +} + +fn quantize_u16_scalar(values: &[f32], qmin: f32, factor: f32, out: &mut [MaybeUninit]) { + debug_assert_eq!(values.len(), out.len()); + for (quantized, &d) in out.iter_mut().zip(values) { + quantized.write(round_ties_even_fixed((d - qmin) * factor) as u16); + } +} + +#[cfg(target_arch = "x86_64")] +mod x86 { + use std::arch::x86_64::*; + use std::mem::MaybeUninit; + + use super::{quantize_u8_scalar, quantize_u16_scalar}; + + pub(super) fn min_max_avx512_dispatch(values: &[f32]) -> (f32, f32) { + // SAFETY: only selected when AVX-512F was detected. + unsafe { min_max_avx512(values) } + } + + #[target_feature(enable = "avx512f")] + unsafe fn min_max_avx512(values: &[f32]) -> (f32, f32) { + // Two accumulators per direction break the lanewise min/max latency + // chain; they are reduced once at the end. + let mut min0 = _mm512_set1_ps(f32::INFINITY); + let mut min1 = min0; + let mut max0 = _mm512_set1_ps(f32::NEG_INFINITY); + let mut max1 = max0; + let mut chunks = values.chunks_exact(32); + for chunk in &mut chunks { + // SAFETY: the chunk holds 32 consecutive floats. 
+ let (v0, v1) = unsafe { + ( + _mm512_loadu_ps(chunk.as_ptr()), + _mm512_loadu_ps(chunk.as_ptr().add(16)), + ) + }; + min0 = _mm512_min_ps(min0, v0); + max0 = _mm512_max_ps(max0, v0); + min1 = _mm512_min_ps(min1, v1); + max1 = _mm512_max_ps(max1, v1); + } + let mut min = _mm512_reduce_min_ps(_mm512_min_ps(min0, min1)); + let mut max = _mm512_reduce_max_ps(_mm512_max_ps(max0, max1)); + for &v in chunks.remainder() { + min = if v < min { v } else { min }; + max = if v > max { v } else { max }; + } + (min, max) + } + + pub(super) fn min_max_avx2_dispatch(values: &[f32]) -> (f32, f32) { + // SAFETY: only selected when AVX2 was detected. + unsafe { min_max_avx2(values) } + } + + #[target_feature(enable = "avx2")] + unsafe fn min_max_avx2(values: &[f32]) -> (f32, f32) { + let mut min0 = _mm256_set1_ps(f32::INFINITY); + let mut min1 = min0; + let mut max0 = _mm256_set1_ps(f32::NEG_INFINITY); + let mut max1 = max0; + let mut chunks = values.chunks_exact(16); + for chunk in &mut chunks { + // SAFETY: the chunk holds 16 consecutive floats. 
+ let (v0, v1) = unsafe { + ( + _mm256_loadu_ps(chunk.as_ptr()), + _mm256_loadu_ps(chunk.as_ptr().add(8)), + ) + }; + min0 = _mm256_min_ps(min0, v0); + max0 = _mm256_max_ps(max0, v0); + min1 = _mm256_min_ps(min1, v1); + max1 = _mm256_max_ps(max1, v1); + } + let mut min = reduce_min_avx2(_mm256_min_ps(min0, min1)); + let mut max = reduce_max_avx2(_mm256_max_ps(max0, max1)); + for &v in chunks.remainder() { + min = if v < min { v } else { min }; + max = if v > max { v } else { max }; + } + (min, max) + } + + #[inline] + #[target_feature(enable = "avx2")] + fn reduce_min_avx2(v: __m256) -> f32 { + let halves = _mm_min_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps::<1>(v)); + let pairs = _mm_min_ps(halves, _mm_movehl_ps(halves, halves)); + let single = _mm_min_ss(pairs, _mm_shuffle_ps::<0b01>(pairs, pairs)); + _mm_cvtss_f32(single) + } + + #[inline] + #[target_feature(enable = "avx2")] + fn reduce_max_avx2(v: __m256) -> f32 { + let halves = _mm_max_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps::<1>(v)); + let pairs = _mm_max_ps(halves, _mm_movehl_ps(halves, halves)); + let single = _mm_max_ss(pairs, _mm_shuffle_ps::<0b01>(pairs, pairs)); + _mm_cvtss_f32(single) + } + + /// Load 16 floats and affine-quantize them into `i32` lanes, rounding to + /// nearest-even with static rounding (`_MM_FROUND_TO_NEAREST_INT`) so the + /// result does not depend on the dynamic MXCSR rounding mode and matches + /// the scalar [`super::round_ties_even_fixed`]. + #[inline] + #[target_feature(enable = "avx512f")] + unsafe fn quantize16_epi32(src: *const f32, min: __m512, factor: __m512) -> __m512i { + // SAFETY: the caller guarantees 16 floats are readable at `src`. 
+ let v = unsafe { _mm512_loadu_ps(src) }; + let scaled = _mm512_mul_ps(_mm512_sub_ps(v, min), factor); + _mm512_cvt_roundps_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(scaled) + } + + pub(super) fn quantize_u8_avx512_dispatch( + values: &[f32], + qmin: f32, + factor: f32, + out: &mut [MaybeUninit], + ) { + // SAFETY: only selected when AVX-512F was detected. + unsafe { quantize_u8_avx512(values, qmin, factor, out) } + } + + #[target_feature(enable = "avx512f")] + unsafe fn quantize_u8_avx512( + values: &[f32], + qmin: f32, + factor: f32, + out: &mut [MaybeUninit], + ) { + debug_assert_eq!(values.len(), out.len()); + let min = _mm512_set1_ps(qmin); + let factor_v = _mm512_set1_ps(factor); + let full = values.len() - values.len() % 16; + let src = values.as_ptr(); + let dst = out.as_mut_ptr().cast::(); + for i in (0..full).step_by(16) { + // SAFETY: `i + 16 <= values.len() == out.len()`. + unsafe { + let q = quantize16_epi32(src.add(i), min, factor_v); + // Unsigned-saturating i32 -> u8 narrow: lanes are in + // [0, 255] plus float epsilon, which saturation clips. + _mm_storeu_si128(dst.add(i).cast(), _mm512_cvtusepi32_epi8(q)); + } + } + quantize_u8_scalar(&values[full..], qmin, factor, &mut out[full..]); + } + + pub(super) fn quantize_u16_avx512_dispatch( + values: &[f32], + qmin: f32, + factor: f32, + out: &mut [MaybeUninit], + ) { + // SAFETY: only selected when AVX-512F was detected. + unsafe { quantize_u16_avx512(values, qmin, factor, out) } + } + + #[target_feature(enable = "avx512f")] + unsafe fn quantize_u16_avx512( + values: &[f32], + qmin: f32, + factor: f32, + out: &mut [MaybeUninit], + ) { + debug_assert_eq!(values.len(), out.len()); + let min = _mm512_set1_ps(qmin); + let factor_v = _mm512_set1_ps(factor); + let full = values.len() - values.len() % 16; + let src = values.as_ptr(); + let dst = out.as_mut_ptr().cast::(); + for i in (0..full).step_by(16) { + // SAFETY: `i + 16 <= values.len() == out.len()`. 
+ unsafe { + let q = quantize16_epi32(src.add(i), min, factor_v); + _mm256_storeu_si256(dst.add(i).cast(), _mm512_cvtusepi32_epi16(q)); + } + } + quantize_u16_scalar(&values[full..], qmin, factor, &mut out[full..]); + } + + /// Load 8 floats and affine-quantize them into `i32` lanes. AVX2 has no + /// embedded-rounding convert, so round to nearest-even explicitly with + /// `_mm256_round_ps` (which ignores MXCSR); the subsequent convert then + /// sees an integral value, so its dynamic rounding mode cannot change the + /// result, keeping it bit-identical to the scalar + /// [`super::round_ties_even_fixed`]. + #[inline] + #[target_feature(enable = "avx2")] + unsafe fn quantize8_epi32(src: *const f32, min: __m256, factor: __m256) -> __m256i { + // SAFETY: the caller guarantees 8 floats are readable at `src`. + let v = unsafe { _mm256_loadu_ps(src) }; + let scaled = _mm256_mul_ps(_mm256_sub_ps(v, min), factor); + let rounded = _mm256_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(scaled); + _mm256_cvtps_epi32(rounded) + } + + pub(super) fn quantize_u8_avx2_dispatch( + values: &[f32], + qmin: f32, + factor: f32, + out: &mut [MaybeUninit], + ) { + // SAFETY: only selected when AVX2 was detected. + unsafe { quantize_u8_avx2(values, qmin, factor, out) } + } + + #[target_feature(enable = "avx2")] + unsafe fn quantize_u8_avx2( + values: &[f32], + qmin: f32, + factor: f32, + out: &mut [MaybeUninit], + ) { + debug_assert_eq!(values.len(), out.len()); + let min = _mm256_set1_ps(qmin); + let factor_v = _mm256_set1_ps(factor); + // The 32->16 and 16->8 packs interleave the two 128-bit lanes; this + // permutation of 32-bit groups restores natural order. + let restore = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7); + let full = values.len() - values.len() % 32; + let src = values.as_ptr(); + let dst = out.as_mut_ptr().cast::(); + for i in (0..full).step_by(32) { + // SAFETY: `i + 32 <= values.len() == out.len()`. 
+ unsafe { + let q0 = quantize8_epi32(src.add(i), min, factor_v); + let q1 = quantize8_epi32(src.add(i + 8), min, factor_v); + let q2 = quantize8_epi32(src.add(i + 16), min, factor_v); + let q3 = quantize8_epi32(src.add(i + 24), min, factor_v); + // Unsigned-saturating i32 -> u16 -> u8 narrows: lanes are in + // [0, 255] plus float epsilon, which saturation clips. + let lo = _mm256_packus_epi32(q0, q1); + let hi = _mm256_packus_epi32(q2, q3); + let bytes = _mm256_permutevar8x32_epi32(_mm256_packus_epi16(lo, hi), restore); + _mm256_storeu_si256(dst.add(i).cast(), bytes); + } + } + quantize_u8_scalar(&values[full..], qmin, factor, &mut out[full..]); + } + + pub(super) fn quantize_u16_avx2_dispatch( + values: &[f32], + qmin: f32, + factor: f32, + out: &mut [MaybeUninit], + ) { + // SAFETY: only selected when AVX2 was detected. + unsafe { quantize_u16_avx2(values, qmin, factor, out) } + } + + #[target_feature(enable = "avx2")] + unsafe fn quantize_u16_avx2( + values: &[f32], + qmin: f32, + factor: f32, + out: &mut [MaybeUninit], + ) { + debug_assert_eq!(values.len(), out.len()); + let min = _mm256_set1_ps(qmin); + let factor_v = _mm256_set1_ps(factor); + let full = values.len() - values.len() % 16; + let src = values.as_ptr(); + let dst = out.as_mut_ptr().cast::(); + for i in (0..full).step_by(16) { + // SAFETY: `i + 16 <= values.len() == out.len()`. + unsafe { + let q0 = quantize8_epi32(src.add(i), min, factor_v); + let q1 = quantize8_epi32(src.add(i + 8), min, factor_v); + // The pack interleaves the 128-bit lanes as + // [q0_lo, q1_lo, q0_hi, q1_hi]; the 64-bit-lane permute + // restores [q0_lo, q0_hi, q1_lo, q1_hi]. 
+ let packed = _mm256_packus_epi32(q0, q1); + let words = _mm256_permute4x64_epi64::<0b11_01_10_00>(packed); + _mm256_storeu_si256(dst.add(i).cast(), words); + } + } + quantize_u16_scalar(&values[full..], qmin, factor, &mut out[full..]); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use rand::rngs::SmallRng; + use rand::{Rng, SeedableRng}; + use rstest::rstest; + + /// Straightforward scalar reference implementing the documented + /// semantics: `total_cmp` min/max plus nearest-even rounding. + fn reference_min_max(values: &[f32]) -> (f32, f32) { + let min = values + .iter() + .cloned() + .min_by(|a, b| a.total_cmp(b)) + .unwrap(); + let max = values + .iter() + .cloned() + .max_by(|a, b| a.total_cmp(b)) + .unwrap(); + (min, max) + } + + fn reference_u8(values: &[f32]) -> (DistTableDequant, Vec) { + let (qmin, qmax) = reference_min_max(values); + if dequant_overflows(values.len(), qmin, qmax) { + return (DistTableDequant::Exact, vec![0; values.len()]); + } + let factor = u8::MAX as f32 / (qmax - qmin); + if !factor.is_finite() { + return ( + DistTableDequant::Affine { qmin, qmax }, + vec![0; values.len()], + ); + } + let quantized = values + .iter() + .map(|&d| ((d - qmin) * factor).round_ties_even() as u8) + .collect(); + (DistTableDequant::Affine { qmin, qmax }, quantized) + } + + fn reference_u16(values: &[f32]) -> (DistTableDequant, Vec) { + let (qmin, qmax) = reference_min_max(values); + if dequant_overflows(values.len(), qmin, qmax) { + return (DistTableDequant::Exact, vec![0; values.len()]); + } + let factor = u16::MAX as f32 / (qmax - qmin); + if !factor.is_finite() { + return ( + DistTableDequant::Affine { qmin, qmax }, + vec![0; values.len()], + ); + } + let quantized = values + .iter() + .map(|&d| ((d - qmin) * factor).round_ties_even() as u16) + .collect(); + (DistTableDequant::Affine { qmin, qmax }, quantized) + } + + fn available_kernels() -> Vec<(&'static str, MinMaxFn, QuantizeU8Fn, QuantizeU16Fn)> { + // `mut` is only exercised on x86_64 
where extra kernels may be pushed. + #[allow(unused_mut)] + let mut kernels = vec![( + "scalar", + min_max_fold as MinMaxFn, + quantize_u8_scalar as QuantizeU8Fn, + quantize_u16_scalar as QuantizeU16Fn, + )]; + #[cfg(target_arch = "x86_64")] + { + if std::arch::is_x86_feature_detected!("avx2") { + kernels.push(( + "avx2", + x86::min_max_avx2_dispatch, + x86::quantize_u8_avx2_dispatch, + x86::quantize_u16_avx2_dispatch, + )); + } + if std::arch::is_x86_feature_detected!("avx512f") { + kernels.push(( + "avx512", + x86::min_max_avx512_dispatch, + x86::quantize_u8_avx512_dispatch, + x86::quantize_u16_avx512_dispatch, + )); + } + } + kernels + } + + /// Every available kernel must agree bit-exactly with the reference on + /// the given input. + fn check_against_reference(values: &[f32]) { + let (expected_dequant_u8, expected_u8) = reference_u8(values); + let (expected_dequant_u16, expected_u16) = reference_u16(values); + let (expected_min, expected_max) = reference_min_max(values); + + for (name, min_max_fn, quantize_u8_fn, quantize_u16_fn) in available_kernels() { + let (qmin, qmax) = min_max_fn(values); + assert_eq!( + (qmin, qmax), + (expected_min, expected_max), + "kernel={name} len={}", + values.len() + ); + + // The quantize kernels are only invoked on the populated path, so + // mirror that guard before exercising them directly. + let overflows = dequant_overflows(values.len(), qmin, qmax); + let factor_u8 = u8::MAX as f32 / (qmax - qmin); + if !overflows && factor_u8.is_finite() { + let mut out_u8 = Vec::with_capacity(values.len()); + quantize_u8_fn( + values, + qmin, + factor_u8, + &mut out_u8.spare_capacity_mut()[..values.len()], + ); + // SAFETY: the kernel initialized every element. 
+ unsafe { out_u8.set_len(values.len()) }; + assert_eq!(out_u8, expected_u8, "kernel={name} len={}", values.len()); + } + + let factor_u16 = u16::MAX as f32 / (qmax - qmin); + if !overflows && factor_u16.is_finite() { + let mut out_u16 = Vec::with_capacity(values.len()); + quantize_u16_fn( + values, + qmin, + factor_u16, + &mut out_u16.spare_capacity_mut()[..values.len()], + ); + // SAFETY: the kernel initialized every element. + unsafe { out_u16.set_len(values.len()) }; + assert_eq!(out_u16, expected_u16, "kernel={name} len={}", values.len()); + } + } + + // The public entry points exercise the dispatched kernels, the + // dequantization classification, and the scratch-buffer handling. + let mut out_u8 = Vec::new(); + assert_eq!( + quantize_dist_table_into(values, &mut out_u8), + expected_dequant_u8, + "len={}", + values.len() + ); + assert_eq!(out_u8, expected_u8, "len={}", values.len()); + let mut out_u16 = Vec::new(); + assert_eq!( + quantize_dist_table_u16_into(values, &mut out_u16), + expected_dequant_u16, + "len={}", + values.len() + ); + assert_eq!(out_u16, expected_u16, "len={}", values.len()); + } + + #[rstest] + fn test_quantize_matches_reference( + #[values(1, 2, 15, 16, 17, 31, 32, 33, 63, 64, 100, 6144, 6160)] len: usize, + #[values(1.0, 1e-3, 1e4)] scale: f32, + ) { + let mut rng = SmallRng::seed_from_u64(42 + len as u64); + let values = (0..len) + .map(|_| rng.random_range(-scale..scale)) + .collect::>(); + check_against_reference(&values); + } + + /// Integer tables with range 510 (resp. 131070) make `factor` exactly + /// 0.5, so odd values land on exact .5 ties; all kernels must round them + /// to even and agree with each other. 
+ #[test] + fn test_exact_half_ties_round_to_even() { + let values = (0..=510).map(|v| v as f32).collect::>(); + check_against_reference(&values); + let mut quantized = Vec::new(); + assert_eq!( + quantize_dist_table_into(&values, &mut quantized), + DistTableDequant::Affine { + qmin: 0.0, + qmax: 510.0 + } + ); + // Spot-check nearest-even: 0.5 -> 0, 1.5 -> 2, 127.5 -> 128, + // 254.5 -> 254. + assert_eq!(&quantized[..6], &[0, 0, 1, 2, 2, 2]); + assert_eq!(quantized[255], 128); + assert_eq!(quantized[509], 254); + assert_eq!(quantized[510], 255); + + // Integers up to 131070 are exactly representable in f32. + let values = (0..=510).map(|v| (v * 257) as f32).collect::>(); + check_against_reference(&values); + let mut quantized = Vec::new(); + assert_eq!( + quantize_dist_table_u16_into(&values, &mut quantized), + DistTableDequant::Affine { + qmin: 0.0, + qmax: 131070.0 + } + ); + // value * 0.5 = 128.5 -> 128, 385.5 -> 386 under nearest-even. + assert_eq!(&quantized[..4], &[0, 128, 257, 386]); + assert_eq!(quantized[510], u16::MAX); + } + + #[test] + fn test_negative_and_mixed_sign_values() { + let mut rng = SmallRng::seed_from_u64(7); + let values = (0..1000) + .map(|_| rng.random_range(-100.0f32..-1.0)) + .collect::>(); + check_against_reference(&values); + let values = (0..999) + .map(|i| (i as f32 - 499.5) * 0.75) + .collect::>(); + check_against_reference(&values); + } + + #[rstest] + fn test_all_equal_input_zeroes_table(#[values(0.0, -7.25, 3.5)] value: f32) { + let values = vec![value; 100]; + check_against_reference(&values); + // Zero range: a zeroed LUT plus the finite affine map (every sum maps + // to `num_tables * value`). 
+ let expected = DistTableDequant::Affine { + qmin: value, + qmax: value, + }; + let mut quantized = vec![1u8; 5]; + assert_eq!(quantize_dist_table_into(&values, &mut quantized), expected); + assert_eq!(quantized, vec![0; 100]); + let mut quantized = vec![1u16; 5]; + assert_eq!( + quantize_dist_table_u16_into(&values, &mut quantized), + expected + ); + assert_eq!(quantized, vec![0; 100]); + } + + /// A finite sub-resolution range zeroes the LUT but still dequantizes + /// with the finite affine map (`Affine`), whereas a range whose + /// `num_tables`-scaled reconstruction overflows must signal `Exact` so the + /// caller computes exact distances instead of `0 * inf = NaN`. + #[test] + fn test_degenerate_range_classification() { + // factor = 255 / 1e-38 overflows to +inf, but the reconstruction + // (num_tables * {0, 1e-38}) stays finite -> Affine, zeroed LUT. + let mut tiny_range = vec![0.0f32; 32]; + tiny_range[1] = 1e-38; + // num_tables * (2e38 - (-2e38)) overflows f32 -> Exact. + let mut huge_range = vec![0.0f32; 32]; + huge_range[0] = -2e38; + huge_range[1] = 2e38; + // factor = 65535 / 1e-35 overflows only in the u16 variant; the u8 + // variant still quantizes normally. 
+ let mut u16_only = vec![0.0f32; 32]; + u16_only[1] = 1e-35; + + for values in [&tiny_range, &huge_range, &u16_only] { + check_against_reference(values); + } + let mut quantized_u8 = Vec::new(); + assert_eq!( + quantize_dist_table_into(&tiny_range, &mut quantized_u8), + DistTableDequant::Affine { + qmin: 0.0, + qmax: 1e-38 + } + ); + assert_eq!(quantized_u8, vec![0; 32]); + assert_eq!( + quantize_dist_table_into(&huge_range, &mut quantized_u8), + DistTableDequant::Exact + ); + assert_eq!(quantized_u8, vec![0; 32]); + let mut quantized_u16 = Vec::new(); + assert_eq!( + quantize_dist_table_u16_into(&u16_only, &mut quantized_u16), + DistTableDequant::Affine { + qmin: 0.0, + qmax: 1e-35 + } + ); + assert_eq!(quantized_u16, vec![0; 32]); + assert_eq!( + quantize_dist_table_into(&u16_only, &mut quantized_u8), + DistTableDequant::Affine { + qmin: 0.0, + qmax: 1e-35 + } + ); + assert_eq!(quantized_u8[1], u8::MAX); + } + + /// `-0.0 == 0.0` must keep taking the zero-range path (zeroed LUT, + /// `Affine`) even though SIMD min/max may pick either sign for the + /// extremes. + #[test] + fn test_signed_zero_mix_zeroes_table() { + let mut values = vec![0.0f32; 64]; + values.iter_mut().step_by(2).for_each(|v| *v = -0.0); + let mut quantized = Vec::new(); + match quantize_dist_table_into(&values, &mut quantized) { + DistTableDequant::Affine { qmin, qmax } => assert_eq!(qmin, qmax), + other => panic!("expected Affine, got {other:?}"), + } + assert_eq!(quantized, vec![0; 64]); + } + + /// Every quantizer — scalar, AVX2, AVX-512, including the SIMD kernels' + /// scalar tails — must round with fixed nearest-even, independent of the + /// dynamic MXCSR rounding mode. Run each with MXCSR forced to + /// round-toward-zero and require it still matches the nearest-even + /// reference (computed under the default mode). 
`factor == 0.5` puts odd + /// integers on exact .5 ties, where truncation (1.5 -> 1) and nearest-even + /// (1.5 -> 2) disagree, so a path that honored MXCSR would fail. The + /// length (511) is deliberately not a multiple of the SIMD step so the + /// kernels' scalar tails are exercised too. + #[cfg(target_arch = "x86_64")] + #[test] + #[allow(deprecated)] // _mm_getcsr/_mm_setcsr: no stable non-asm replacement. + fn test_quantize_rounding_ignores_mxcsr() { + use std::arch::x86_64::{_MM_ROUND_MASK, _MM_ROUND_TOWARD_ZERO, _mm_getcsr, _mm_setcsr}; + + let values = (0..=510).map(|v| v as f32).collect::>(); + // Computed under the default (nearest-even) rounding mode. + let (_, expected_u8) = reference_u8(&values); + let (_, expected_u16) = reference_u16(&values); + let factor_u8 = u8::MAX as f32 / 510.0; + let factor_u16 = u16::MAX as f32 / 510.0; + + for (name, _, quantize_u8_fn, quantize_u16_fn) in available_kernels() { + let mut out_u8 = Vec::with_capacity(values.len()); + let mut out_u16 = Vec::with_capacity(values.len()); + // SAFETY: SSE is baseline on x86_64. MXCSR is restored before any + // assertion so a failure cannot leak the truncating mode. + let saved = unsafe { _mm_getcsr() }; + unsafe { + _mm_setcsr((saved & !_MM_ROUND_MASK) | _MM_ROUND_TOWARD_ZERO); + quantize_u8_fn( + &values, + 0.0, + factor_u8, + &mut out_u8.spare_capacity_mut()[..values.len()], + ); + quantize_u16_fn( + &values, + 0.0, + factor_u16, + &mut out_u16.spare_capacity_mut()[..values.len()], + ); + _mm_setcsr(saved); + out_u8.set_len(values.len()); + out_u16.set_len(values.len()); + } + assert_eq!(out_u8, expected_u8, "kernel={name} under truncating MXCSR"); + assert_eq!( + out_u16, expected_u16, + "kernel={name} under truncating MXCSR" + ); + } + } + + /// The scratch buffer must be fully overwritten across reuses with + /// different lengths. 
+ #[test] + fn test_scratch_buffer_reuse() { + let mut rng = SmallRng::seed_from_u64(11); + let mut scratch_u8 = vec![7u8; 500]; + let mut scratch_u16 = vec![7u16; 500]; + for len in [48, 512, 16] { + let values = (0..len) + .map(|_| rng.random_range(-1.0f32..1.0)) + .collect::>(); + quantize_dist_table_into(&values, &mut scratch_u8); + assert_eq!(scratch_u8, reference_u8(&values).1); + quantize_dist_table_u16_into(&values, &mut scratch_u16); + assert_eq!(scratch_u16, reference_u16(&values).1); + } + } +} diff --git a/rust/lance-index/src/vector/bq/storage.rs b/rust/lance-index/src/vector/bq/storage.rs index 36e56986921..ef88500a7b7 100644 --- a/rust/lance-index/src/vector/bq/storage.rs +++ b/rust/lance-index/src/vector/bq/storage.rs @@ -41,6 +41,9 @@ use serde::{Deserialize, Serialize}; use crate::frag_reuse::FragReuseIndex; use crate::pb; use crate::vector::ApproxMode; +use crate::vector::bq::dist_table_quant::{ + DistTableDequant, quantize_dist_table_into, quantize_dist_table_u16_into, +}; use crate::vector::bq::ex_dot::{ EX_DOT_BLOCK_DIMS, ExDotFn, blocked_ex_code_bytes, ex_dot_kernel, pad_query_into, padded_query_len, repack_sequential_row, sequential_matches_blocked, @@ -897,6 +900,22 @@ impl<'a> RabitDistCalculator<'a> { ) } + /// Fill `dists[0..n]` with exact per-row binary distances computed + /// directly from the f32 dist table — the fallback when the quantized + /// reconstruction scale would be non-finite ([`DistTableDequant::Exact`]). + #[allow(clippy::uninit_vec)] + fn fill_exact_binary_distances(&self, n: usize, code_len: usize, dists: &mut Vec) { + dists.clear(); + dists.reserve(n); + // SAFETY: the loop initializes every element in [0, n). 
+ unsafe { + dists.set_len(n); + } + dists.iter_mut().enumerate().for_each(|(id, dist)| { + *dist = compute_single_rq_distance(self.codes, id, n, code_len, &self.dist_table); + }); + } + #[allow(clippy::uninit_vec)] fn binary_distances_with_scratch( &self, @@ -918,7 +937,16 @@ impl<'a> RabitDistCalculator<'a> { ); } - let (qmin, qmax) = quantize_dist_table_into(&self.dist_table, quantized_dists_table); + let (qmin, qmax) = match quantize_dist_table_into(&self.dist_table, quantized_dists_table) { + DistTableDequant::Affine { qmin, qmax } => (qmin, qmax), + DistTableDequant::Exact => { + // The affine reconstruction would be non-finite; compute every + // binary distance exactly and report no SIMD rows so the + // ex-rerank caller takes the per-row path for all of them. + self.fill_exact_binary_distances(n, code_len, dists); + return 0; + } + }; let remainder = n % BATCH_SIZE; let simd_len = n - remainder; quantized_dists.clear(); @@ -978,7 +1006,16 @@ impl<'a> RabitDistCalculator<'a> { hacc_dist_table: &mut Vec, quantized_dists: &mut Vec, ) -> usize { - let (qmin, qmax) = quantize_dist_table_u16_into(&self.dist_table, quantized_dist_table); + let (qmin, qmax) = + match quantize_dist_table_u16_into(&self.dist_table, quantized_dist_table) { + DistTableDequant::Affine { qmin, qmax } => (qmin, qmax), + DistTableDequant::Exact => { + // See binary_distances_with_scratch: non-finite affine + // scale falls back to exact per-row distances. + self.fill_exact_binary_distances(n, code_len, dists); + return 0; + } + }; simd::dist_table::transfer_4bit_dist_table_u16(quantized_dist_table, hacc_dist_table); let remainder = n % BATCH_SIZE; let simd_len = n - remainder; @@ -1345,68 +1382,6 @@ where }) } -// Quantize the distance table into a caller-owned buffer. 
-#[inline] -fn quantize_dist_table_into(dist_table: &[f32], quantized_dist_table: &mut Vec) -> (f32, f32) { - let (qmin, qmax) = dist_table - .iter() - .cloned() - .minmax_by(|a, b| a.total_cmp(b)) - .into_option() - .unwrap(); - // this happens if the query is all zeros - if qmin == qmax { - quantized_dist_table.clear(); - quantized_dist_table.resize(dist_table.len(), 0); - return (qmin, qmax); - } - let factor = 255.0 / (qmax - qmin); - quantized_dist_table.clear(); - quantized_dist_table.reserve(dist_table.len()); - let spare = quantized_dist_table.spare_capacity_mut(); - for (quantized, &d) in spare[..dist_table.len()].iter_mut().zip(dist_table.iter()) { - quantized.write(((d - qmin) * factor).round() as u8); - } - // SAFETY: every element in the reserved range was initialized in the loop above. - unsafe { - quantized_dist_table.set_len(dist_table.len()); - } - - (qmin, qmax) -} - -#[inline] -fn quantize_dist_table_u16_into( - dist_table: &[f32], - quantized_dist_table: &mut Vec, -) -> (f32, f32) { - let (qmin, qmax) = dist_table - .iter() - .cloned() - .minmax_by(|a, b| a.total_cmp(b)) - .into_option() - .unwrap(); - if qmin == qmax { - quantized_dist_table.clear(); - quantized_dist_table.resize(dist_table.len(), 0); - return (qmin, qmax); - } - - let factor = u16::MAX as f32 / (qmax - qmin); - quantized_dist_table.clear(); - quantized_dist_table.reserve(dist_table.len()); - let spare = quantized_dist_table.spare_capacity_mut(); - for (quantized, &d) in spare[..dist_table.len()].iter_mut().zip(dist_table.iter()) { - quantized.write(((d - qmin) * factor).round() as u16); - } - // SAFETY: every element in the reserved range was initialized in the loop above. 
- unsafe { - quantized_dist_table.set_len(dist_table.len()); - } - - (qmin, qmax) -} - /// Build the u8 FastScan LUT for the ex codes directly from the rotated /// query (`ex_query`, natural dim order, padding dims zero): the underlying /// per-dim table is the pure multiplication `q[d] * code`, so no intermediate @@ -2551,6 +2526,7 @@ mod tests { use arrow_array::{ArrayRef, Float32Array, Float64Array, UInt64Array}; use lance_core::ROW_ID; use lance_linalg::distance::DistanceType; + use rstest::rstest; use crate::vector::bq::{RQRotationType, builder::RabitQuantizer}; use crate::vector::quantizer::{Quantization, QuantizerStorage}; @@ -3547,6 +3523,97 @@ mod tests { } } + /// A dist table whose `num_tables`-scaled reconstruction overflows `f32` + /// must fall back to exact distances rather than the affine dequant's + /// `0 * inf = NaN`. Covers both the u8 (Normal) and u16 (Accurate) LUT + /// paths end-to-end through `distance_all`, asserting the result is + /// NaN-free and bit-identical to the always-exact per-row computation. 
+ #[rstest] + fn test_degenerate_dist_table_falls_back_to_exact_distances( + #[values(ApproxMode::Normal, ApproxMode::Accurate)] approx_mode: ApproxMode, + ) { + let code_dim = 8usize; + let num_rows = BATCH_SIZE + 5; + let num_bits = 3; + let ex_bits = rabit_ex_bits(num_bits).unwrap(); + let identity = Float32Array::from_iter_values( + (0..code_dim) + .flat_map(|row| (0..code_dim).map(move |col| if row == col { 1.0 } else { 0.0 })), + ); + let rotate_mat = + FixedSizeListArray::try_new_from_values(identity, code_dim as i32).unwrap(); + let metadata = RabitQuantizationMetadata { + rotate_mat: Some(rotate_mat), + rotate_mat_position: None, + fast_rotation_signs: None, + rotation_type: RQRotationType::Matrix, + code_dim: code_dim as u32, + num_bits, + packed: false, + query_estimator: RabitQueryEstimator::RawQuery, + }; + let codes = FixedSizeListArray::try_new_from_values( + UInt8Array::from_iter_values((0..num_rows).map(|idx| (idx * 19) as u8)), + rabit_binary_code_bytes(code_dim) as i32, + ) + .unwrap(); + let ex_codes = make_test_ex_codes(num_rows, code_dim, num_bits); + let batch = make_test_batch_with_ex(codes, ex_codes); + let storage = + RabitQuantizationStorage::try_from_batch(batch, &metadata, DistanceType::L2, None) + .unwrap(); + let query = Arc::new(Float32Array::from(vec![1.0; code_dim])) as ArrayRef; + + let mut calc = storage.dist_calculator(query, 4.0); + calc.approx_mode = approx_mode; + // num_tables = (code_dim * 4) / SEGMENT_NUM_CODES = 2; the extrema sum + // (qmax - qmin = 4e38) overflows when scaled by num_tables, so the + // quantizer returns `Exact`. Per-row sums stay finite (each row reads + // one entry per segment), so the exact path is well-defined. 
+ let mut degenerate = vec![0.0f32; code_dim * 4]; + degenerate[0] = -2e38; + degenerate[1] = 2e38; + calc.dist_table = Cow::Owned(degenerate); + + let code_len = rabit_binary_code_bytes(code_dim); + let ex_codes = calc.ex_codes.unwrap(); + let ex_add_factors = calc.ex_add_factors.unwrap(); + let ex_scale_factors = calc.ex_scale_factors.unwrap(); + let expected = (0..num_rows) + .map(|id| { + let binary_ip = compute_single_rq_distance( + calc.codes, + id, + num_rows, + code_len, + &calc.dist_table, + ); + calc.raw_query_multi_bit_exact_distance( + id, + binary_ip, + ex_bits, + ex_codes, + ex_add_factors, + ex_scale_factors, + ) + }) + .collect::>(); + + let actual = calc.distance_all(0); + assert_eq!(actual.len(), num_rows); + for id in 0..num_rows { + assert!( + !actual[id].is_nan(), + "approx_mode={approx_mode:?} id={id}: degenerate table produced NaN" + ); + assert_eq!( + actual[id].to_bits(), + expected[id].to_bits(), + "approx_mode={approx_mode:?} id={id}: distance_all must match the exact path" + ); + } + } + #[test] fn test_raw_query_multi_bit_accumulate_topk_uses_lower_bound_gating() { let code_dim = 8usize; From 924e4a4ebaef8e0e0865b5cfbf8fbd712fc717df Mon Sep 17 00:00:00 2001 From: Yang Cen Date: Mon, 15 Jun 2026 18:33:41 +0800 Subject: [PATCH 100/177] perf(index): set ivf_rq target partition size to 4096 (#7273) ## Performance Improvement ### What is the performance issue or bottleneck? IVF_RQ currently falls through to the generic vector index target partition size of 8192. Recent IVF_RQ tuning indicates a smaller default partition target is preferred for this index type. ### How does this PR improve performance? This PR gives `IndexType::IvfRq` an explicit `target_partition_size()` default of 4096. IVF_RQ index creation paths that do not set `target_partition_size` will now derive the number of IVF partitions from 4096 instead of the generic fallback. It also adds a focused unit test so the IVF_RQ default cannot silently fall back to 8192 again. 
### Benchmark or measurement results No new benchmark run in this PR. This only changes the configured default and protects it with a unit test. ## Verification - `cargo fmt --all` - `cargo test -p lance-index test_ivf_rq_target_partition_size` - `cargo clippy --all --tests --benches -- -D warnings` --- rust/lance-index/src/lib.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/rust/lance-index/src/lib.rs b/rust/lance-index/src/lib.rs index 888070a3c1f..c7cace92428 100644 --- a/rust/lance-index/src/lib.rs +++ b/rust/lance-index/src/lib.rs @@ -312,6 +312,7 @@ impl IndexType { Self::IvfFlat => 4096, Self::IvfSq => 8192, Self::IvfPq => 8192, + Self::IvfRq => 4096, Self::IvfHnswFlat => 1 << 20, Self::IvfHnswSq => 1 << 20, Self::IvfHnswPq => 1 << 20, @@ -382,6 +383,11 @@ mod tests { assert_eq!(IndexType::max_vector_version(), IVF_RQ_INDEX_VERSION); } + #[test] + fn test_ivf_rq_target_partition_size() { + assert_eq!(IndexType::IvfRq.target_partition_size(), 4096); + } + #[test] fn test_index_type_try_from_i32_covers_all_variants() { let all = [ From c188de59fcf0976a0a9fef53ae67ae7ae8bcb61a Mon Sep 17 00:00:00 2001 From: Lance Release Bot Date: Mon, 15 Jun 2026 15:42:39 +0000 Subject: [PATCH 101/177] chore: release beta version 8.0.0-beta.14 --- .bumpversion.toml | 2 +- Cargo.lock | 100 ++++++++-------- Cargo.toml | 44 +++---- java/lance-jni/Cargo.lock | 244 +++++++------------------------------- java/lance-jni/Cargo.toml | 2 +- java/pom.xml | 2 +- python/Cargo.lock | 92 +++++++------- python/Cargo.toml | 2 +- 8 files changed, 165 insertions(+), 323 deletions(-) diff --git a/.bumpversion.toml b/.bumpversion.toml index 32cca52aa9a..91adc6c59c1 100644 --- a/.bumpversion.toml +++ b/.bumpversion.toml @@ -1,5 +1,5 @@ [tool.bumpversion] -current_version = "8.0.0-beta.13" +current_version = "8.0.0-beta.14" parse = "(?P\\d+)\\.(?P\\d+)\\.(?P\\d+)(-(?P(beta|rc))\\.(?P\\d+))?" 
serialize = [ "{major}.{minor}.{patch}-{prerelease}.{prerelease_num}", diff --git a/Cargo.lock b/Cargo.lock index 59ca17eec12..1e820b0c63d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3076,7 +3076,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow-array", "rand 0.9.4", @@ -3462,9 +3462,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.4.14" +version = "0.4.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "171fefbc92fe4a4de27e0698d6a5b392d6a0e333506bc49133760b3bcf948733" +checksum = "6cb093c84e8bd9b188d4c4a8cb6579fc016968d14c99882163cd3ff402a4f155" dependencies = [ "atomic-waker", "bytes", @@ -4331,9 +4331,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.100" +version = "0.3.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2025f20d7a4fa7785846e7b63d10a76d3f1cee98ee5cb79ea59703f95e42162" +checksum = "03d04c30968dffe80775bd4d7fb676131cd04a1fb46d2686dbffbaec2d9dfd31" dependencies = [ "cfg-if 1.0.4", "futures-util", @@ -4388,7 +4388,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "all_asserts", "approx", @@ -4491,7 +4491,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow-array", "arrow-buffer", @@ -4539,7 +4539,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrayref", "paste", @@ -4548,7 +4548,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow-array", "arrow-buffer", @@ -4588,7 +4588,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.13" 
+version = "8.0.0-beta.14" dependencies = [ "arrow", "arrow-array", @@ -4621,7 +4621,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow", "arrow-array", @@ -4640,7 +4640,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "proc-macro2", "quote", @@ -4649,7 +4649,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow-arith", "arrow-array", @@ -4694,7 +4694,7 @@ dependencies = [ [[package]] name = "lance-examples" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "all_asserts", "arrow", @@ -4720,7 +4720,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow-arith", "arrow-array", @@ -4759,7 +4759,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "datafusion", "geo-traits", @@ -4773,7 +4773,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "approx", "arc-swap", @@ -4849,7 +4849,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow", "arrow-arith", @@ -4897,7 +4897,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "approx", "arrow-array", @@ -4916,7 +4916,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow", "async-trait", @@ -4928,7 +4928,7 @@ dependencies = [ [[package]] name = "lance-namespace-datafusion" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow-array", "arrow-schema", @@ -4944,7 +4944,7 @@ dependencies = [ 
[[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow", "arrow-array", @@ -4994,9 +4994,9 @@ dependencies = [ [[package]] name = "lance-namespace-reqwest-client" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d287494559c22838ce34e51ea0fa29dc780d5be8283de5ab33e9395623000c8" +checksum = "ba3f0a235e3ed5f8805205649ccc7d7d0f3df23ce1294242c9265ad488d7f19d" dependencies = [ "reqwest 0.12.28", "serde", @@ -5008,7 +5008,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow-array", "arrow-buffer", @@ -5026,7 +5026,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow", "arrow-array", @@ -5072,7 +5072,7 @@ dependencies = [ [[package]] name = "lance-test-macros" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "proc-macro2", "quote", @@ -5081,7 +5081,7 @@ dependencies = [ [[package]] name = "lance-testing" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow-array", "arrow-schema", @@ -5094,7 +5094,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "icu_segmenter", "jieba-rs", @@ -5106,7 +5106,7 @@ dependencies = [ [[package]] name = "lance-tools" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "clap", "lance-core", @@ -6226,9 +6226,9 @@ dependencies = [ [[package]] name = "openssl" -version = "0.10.80" +version = "0.10.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a45fa2aa886c42762255da344f0a0d313e254066c46aad76f300c3d3da62d967" +checksum = "77823a27f0babb03091cb9ed9ef80af3b39dbc82f97e8fa530374b7dafd87a45" dependencies = [ "bitflags 2.13.0", "cfg-if 1.0.4", @@ -6257,9 +6257,9 @@ checksum = 
"7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" [[package]] name = "openssl-sys" -version = "0.9.116" +version = "0.9.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f28a22dc7140cda5f096e5e7724a6962ca81a7f8bfd2979f9b18c11af56318c4" +checksum = "b47e7e6bb2c38cd930d25a23b40fa52e068c10e85f3e03a7f5ba5aaca5713695" dependencies = [ "cc", "libc", @@ -9484,9 +9484,9 @@ dependencies = [ [[package]] name = "wasip2" -version = "1.0.3+wasi-0.2.9" +version = "1.0.4+wasi-0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" +checksum = "b67efb37e106e55ce722a510d6b5f9c17f083e5fc79afc2badeb12cc313d9487" dependencies = [ "wit-bindgen 0.57.1", ] @@ -9511,9 +9511,9 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.123" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a254a4b10c19a76f09a27640e7ffbf9bc30bf67e16a3bf28aaefa4920fe81563" +checksum = "8ddb3f79143bced6de84270411622a2699cee572fc0875aeaf1e7867cf9fca1a" dependencies = [ "cfg-if 1.0.4", "once_cell", @@ -9524,9 +9524,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.73" +version = "0.4.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54568702fabf5d4849ce2b90fadfa64168a097eaf4b351ce9df8b687a0086aaf" +checksum = "503b14d284f2c8dac03b819967e155ea753f573586193b2b2c95990cb5d69280" dependencies = [ "js-sys", "wasm-bindgen", @@ -9534,9 +9534,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.123" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24a40fc75b0ec6f3746ceb10d36f53a93dcd68a93b11b6445983945d79eba0dc" +checksum = "4e21a184b13fb19e157296e2c46056aec9092264fab83e4ba59e68c61b323c3d" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -9544,9 +9544,9 @@ dependencies = [ 
[[package]] name = "wasm-bindgen-macro-support" -version = "0.2.123" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "908f34bd9b9ce3d4caf07b72dfab63d61504d156856c6bd3cd87fa350cf3985b" +checksum = "fecefd9c35bd935a20fc3fc344b5f29138961e4f47fb03297d88f2587afb5ebd" dependencies = [ "bumpalo", "proc-macro2", @@ -9557,9 +9557,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.123" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7acbf7616c27b194bbb550bf77ed0c2c3e5b7fd1260a93082b95fb7f47959b92" +checksum = "23939e44bb9a5d7576fa2b563dc2e136628f1224e88a8deed09e04858b77871f" dependencies = [ "unicode-ident", ] @@ -9626,9 +9626,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.100" +version = "0.3.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e0871acf327f283dc6da28a1696cdc64fb355ba9f935d052021fa77f35cce69" +checksum = "a6430a72df5eb332242960fe84b3002a241163998241eb596d4f739b9757061d" dependencies = [ "js-sys", "wasm-bindgen", @@ -10398,9 +10398,9 @@ dependencies = [ [[package]] name = "zeroize" -version = "1.8.2" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" +checksum = "e13c156562582aa81c60cb29407084cdb54c4164760106ab78e6c5b0858cf64e" [[package]] name = "zerotrie" diff --git a/Cargo.toml b/Cargo.toml index 3b57ab3498e..68a9002872b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ resolver = "3" [workspace.package] -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" @@ -57,27 +57,27 @@ rust-version = "1.91.0" [workspace.dependencies] arc-swap = "1.7" libc = "0.2.176" -lance = { version = "=8.0.0-beta.13", path = "./rust/lance", default-features = false } -lance-arrow = { version = "=8.0.0-beta.13", path = 
"./rust/lance-arrow" } -lance-core = { version = "=8.0.0-beta.13", path = "./rust/lance-core" } -lance-datafusion = { version = "=8.0.0-beta.13", path = "./rust/lance-datafusion" } -lance-datagen = { version = "=8.0.0-beta.13", path = "./rust/lance-datagen" } -lance-derive = { version = "=8.0.0-beta.13", path = "./rust/lance-derive" } -lance-encoding = { version = "=8.0.0-beta.13", path = "./rust/lance-encoding" } -lance-file = { version = "=8.0.0-beta.13", path = "./rust/lance-file" } -lance-geo = { version = "=8.0.0-beta.13", path = "./rust/lance-geo" } -lance-index = { version = "=8.0.0-beta.13", path = "./rust/lance-index" } -lance-io = { version = "=8.0.0-beta.13", path = "./rust/lance-io", default-features = false } -lance-linalg = { version = "=8.0.0-beta.13", path = "./rust/lance-linalg" } -lance-namespace = { version = "=8.0.0-beta.13", path = "./rust/lance-namespace" } -lance-namespace-impls = { version = "=8.0.0-beta.13", path = "./rust/lance-namespace-impls" } +lance = { version = "=8.0.0-beta.14", path = "./rust/lance", default-features = false } +lance-arrow = { version = "=8.0.0-beta.14", path = "./rust/lance-arrow" } +lance-core = { version = "=8.0.0-beta.14", path = "./rust/lance-core" } +lance-datafusion = { version = "=8.0.0-beta.14", path = "./rust/lance-datafusion" } +lance-datagen = { version = "=8.0.0-beta.14", path = "./rust/lance-datagen" } +lance-derive = { version = "=8.0.0-beta.14", path = "./rust/lance-derive" } +lance-encoding = { version = "=8.0.0-beta.14", path = "./rust/lance-encoding" } +lance-file = { version = "=8.0.0-beta.14", path = "./rust/lance-file" } +lance-geo = { version = "=8.0.0-beta.14", path = "./rust/lance-geo" } +lance-index = { version = "=8.0.0-beta.14", path = "./rust/lance-index" } +lance-io = { version = "=8.0.0-beta.14", path = "./rust/lance-io", default-features = false } +lance-linalg = { version = "=8.0.0-beta.14", path = "./rust/lance-linalg" } +lance-namespace = { version = "=8.0.0-beta.14", path = 
"./rust/lance-namespace" } +lance-namespace-impls = { version = "=8.0.0-beta.14", path = "./rust/lance-namespace-impls" } lance-namespace-datafusion = { version = "=7.0.0-beta.9", path = "./rust/lance-namespace-datafusion" } lance-namespace-reqwest-client = "0.8.4" -lance-select = { version = "=8.0.0-beta.13", path = "./rust/lance-select" } -lance-tokenizer = { version = "=8.0.0-beta.13", path = "./rust/lance-tokenizer" } -lance-table = { version = "=8.0.0-beta.13", path = "./rust/lance-table" } -lance-test-macros = { version = "=8.0.0-beta.13", path = "./rust/lance-test-macros" } -lance-testing = { version = "=8.0.0-beta.13", path = "./rust/lance-testing" } +lance-select = { version = "=8.0.0-beta.14", path = "./rust/lance-select" } +lance-tokenizer = { version = "=8.0.0-beta.14", path = "./rust/lance-tokenizer" } +lance-table = { version = "=8.0.0-beta.14", path = "./rust/lance-table" } +lance-test-macros = { version = "=8.0.0-beta.14", path = "./rust/lance-test-macros" } +lance-testing = { version = "=8.0.0-beta.14", path = "./rust/lance-testing" } approx = "0.5.1" # Note that this one does not include pyarrow arrow = { version = "58.0.0", optional = false, features = ["prettyprint"] } @@ -104,7 +104,7 @@ half = { "version" = "2.1", default-features = false, features = [ "num-traits", "std", ] } -lance-bitpacking = { version = "=8.0.0-beta.13", path = "./rust/compression/bitpacking" } +lance-bitpacking = { version = "=8.0.0-beta.14", path = "./rust/compression/bitpacking" } bitpacking = "0.9" bitvec = "1" bytes = "1.11.1" @@ -143,7 +143,7 @@ datafusion-substrait = { version = "53.0.0", default-features = false } dirs = "6.0.0" either = "1.0" fst = { version = "0.4.7", features = ["levenshtein"] } -fsst = { version = "=8.0.0-beta.13", path = "./rust/compression/fsst" } +fsst = { version = "=8.0.0-beta.14", path = "./rust/compression/fsst" } futures = "0.3" geoarrow-array = "0.8" geoarrow-schema = "0.8" diff --git a/java/lance-jni/Cargo.lock 
b/java/lance-jni/Cargo.lock index 5733f730fc0..ea9a0c0848f 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -42,21 +42,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "alloc-no-stdlib" -version = "2.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" - -[[package]] -name = "alloc-stdlib" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" -dependencies = [ - "alloc-no-stdlib", -] - [[package]] name = "allocator-api2" version = "0.2.21" @@ -1004,27 +989,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "brotli" -version = "8.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8119e4516436f5708bbc474a9d395bf12f1b5395e93a92a56e647ac3388c8610" -dependencies = [ - "alloc-no-stdlib", - "alloc-stdlib", - "brotli-decompressor", -] - -[[package]] -name = "brotli-decompressor" -version = "5.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5962523e1b92ce1b5e793d9169b9943eece10d39f62550bc04bb605d75b94924" -dependencies = [ - "alloc-no-stdlib", - "alloc-stdlib", -] - [[package]] name = "bs58" version = "0.5.1" @@ -1610,7 +1574,6 @@ dependencies = [ "datafusion-datasource-arrow", "datafusion-datasource-csv", "datafusion-datasource-json", - "datafusion-datasource-parquet", "datafusion-execution", "datafusion-expr", "datafusion-expr-common", @@ -1632,7 +1595,6 @@ dependencies = [ "log", "object_store", "parking_lot", - "parquet", "rand 0.9.4", "regex", "sqlparser", @@ -1707,7 +1669,6 @@ dependencies = [ "libc", "log", "object_store", - "parquet", "paste", "sqlparser", "tokio", @@ -1825,36 +1786,6 @@ dependencies = [ "tokio-stream", ] -[[package]] -name = "datafusion-datasource-parquet" -version = "53.1.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a8e0365e0e08e8ff94d912f0ababcf9065a1a304018ba90b1fc83c855b4997" -dependencies = [ - "arrow", - "async-trait", - "bytes", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions-aggregate-common", - "datafusion-physical-expr", - "datafusion-physical-expr-adapter", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-pruning", - "datafusion-session", - "futures", - "itertools 0.14.0", - "log", - "object_store", - "parking_lot", - "parquet", - "tokio", -] - [[package]] name = "datafusion-doc" version = "53.1.0" @@ -2505,7 +2436,6 @@ checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" dependencies = [ "crc32fast", "miniz_oxide", - "zlib-rs", ] [[package]] @@ -2549,7 +2479,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow-array", "rand 0.9.4", @@ -2921,9 +2851,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.4.14" +version = "0.4.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "171fefbc92fe4a4de27e0698d6a5b392d6a0e333506bc49133760b3bcf948733" +checksum = "6cb093c84e8bd9b188d4c4a8cb6579fc016968d14c99882163cd3ff402a4f155" dependencies = [ "atomic-waker", "bytes", @@ -3510,12 +3440,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "integer-encoding" -version = "3.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" - [[package]] name = "io-uring" version = "0.7.12" @@ -3701,9 +3625,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.100" +version = "0.3.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"f2025f20d7a4fa7785846e7b63d10a76d3f1cee98ee5cb79ea59703f95e42162" +checksum = "03d04c30968dffe80775bd4d7fb676131cd04a1fb46d2686dbffbaec2d9dfd31" dependencies = [ "cfg-if 1.0.4", "futures-util", @@ -3723,7 +3647,7 @@ dependencies = [ "jiff", "nom", "num-traits", - "ordered-float 5.3.0", + "ordered-float", "rand 0.9.4", "serde", "serde_json", @@ -3749,7 +3673,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arc-swap", "arrow", @@ -3822,7 +3746,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow-array", "arrow-buffer", @@ -3864,7 +3788,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrayref", "paste", @@ -3873,7 +3797,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow-array", "arrow-buffer", @@ -3911,7 +3835,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow", "arrow-array", @@ -3943,7 +3867,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow", "arrow-array", @@ -3956,12 +3880,11 @@ dependencies = [ "rand 0.9.4", "rand_distr", "rand_xoshiro", - "random_word", ] [[package]] name = "lance-derive" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "proc-macro2", "quote", @@ -3970,7 +3893,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow-arith", "arrow-array", @@ -4005,7 +3928,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow-arith", 
"arrow-array", @@ -4035,7 +3958,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "datafusion", "geo-traits", @@ -4049,7 +3972,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arc-swap", "arrow", @@ -4116,7 +4039,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow", "arrow-arith", @@ -4157,7 +4080,7 @@ dependencies = [ [[package]] name = "lance-jni" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow", "arrow-array", @@ -4193,7 +4116,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow-array", "arrow-buffer", @@ -4208,7 +4131,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow", "async-trait", @@ -4220,7 +4143,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow", "arrow-ipc", @@ -4255,9 +4178,9 @@ dependencies = [ [[package]] name = "lance-namespace-reqwest-client" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d287494559c22838ce34e51ea0fa29dc780d5be8283de5ab33e9395623000c8" +checksum = "ba3f0a235e3ed5f8805205649ccc7d7d0f3df23ce1294242c9265ad488d7f19d" dependencies = [ "reqwest 0.12.28", "serde", @@ -4269,7 +4192,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow-array", "arrow-buffer", @@ -4284,7 +4207,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow", "arrow-array", @@ -4321,7 +4244,7 @@ dependencies = [ [[package]] name = 
"lance-tokenizer" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "icu_segmenter", "rust-stemmers", @@ -5178,15 +5101,6 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" -[[package]] -name = "ordered-float" -version = "2.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" -dependencies = [ - "num-traits", -] - [[package]] name = "ordered-float" version = "5.3.0" @@ -5250,42 +5164,6 @@ dependencies = [ "windows-link", ] -[[package]] -name = "parquet" -version = "58.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dafa7d01085b62a47dd0c1829550a0a36710ea9c4fe358a05a85477cec8a908" -dependencies = [ - "ahash", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-ipc", - "arrow-schema", - "arrow-select", - "base64", - "brotli", - "bytes", - "chrono", - "flate2", - "futures", - "half", - "hashbrown 0.17.1", - "lz4_flex", - "num-bigint", - "num-integer", - "num-traits", - "object_store", - "paste", - "seq-macro", - "simdutf8", - "snap", - "thrift", - "tokio", - "twox-hash", - "zstd", -] - [[package]] name = "paste" version = "1.0.15" @@ -5805,19 +5683,6 @@ dependencies = [ "rand_core 0.9.5", ] -[[package]] -name = "random_word" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e47a395bdb55442b883c89062d6bcff25dc90fa5f8369af81e0ac6d49d78cf81" -dependencies = [ - "ahash", - "brotli", - "paste", - "rand 0.9.4", - "unicase", -] - [[package]] name = "rangemap" version = "1.7.1" @@ -6822,12 +6687,6 @@ dependencies = [ "syn", ] -[[package]] -name = "snap" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" - [[package]] name = "socket2" version = 
"0.6.4" @@ -7135,17 +6994,6 @@ dependencies = [ "cfg-if 1.0.4", ] -[[package]] -name = "thrift" -version = "0.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" -dependencies = [ - "byteorder", - "integer-encoding", - "ordered-float 2.10.1", -] - [[package]] name = "time" version = "0.3.47" @@ -7741,9 +7589,9 @@ dependencies = [ [[package]] name = "wasip2" -version = "1.0.3+wasi-0.2.9" +version = "1.0.4+wasi-0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" +checksum = "b67efb37e106e55ce722a510d6b5f9c17f083e5fc79afc2badeb12cc313d9487" dependencies = [ "wit-bindgen 0.57.1", ] @@ -7768,9 +7616,9 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.123" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a254a4b10c19a76f09a27640e7ffbf9bc30bf67e16a3bf28aaefa4920fe81563" +checksum = "8ddb3f79143bced6de84270411622a2699cee572fc0875aeaf1e7867cf9fca1a" dependencies = [ "cfg-if 1.0.4", "once_cell", @@ -7781,9 +7629,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.73" +version = "0.4.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54568702fabf5d4849ce2b90fadfa64168a097eaf4b351ce9df8b687a0086aaf" +checksum = "503b14d284f2c8dac03b819967e155ea753f573586193b2b2c95990cb5d69280" dependencies = [ "js-sys", "wasm-bindgen", @@ -7791,9 +7639,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.123" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24a40fc75b0ec6f3746ceb10d36f53a93dcd68a93b11b6445983945d79eba0dc" +checksum = "4e21a184b13fb19e157296e2c46056aec9092264fab83e4ba59e68c61b323c3d" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -7801,9 +7649,9 @@ dependencies = [ 
[[package]] name = "wasm-bindgen-macro-support" -version = "0.2.123" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "908f34bd9b9ce3d4caf07b72dfab63d61504d156856c6bd3cd87fa350cf3985b" +checksum = "fecefd9c35bd935a20fc3fc344b5f29138961e4f47fb03297d88f2587afb5ebd" dependencies = [ "bumpalo", "proc-macro2", @@ -7814,9 +7662,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.123" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7acbf7616c27b194bbb550bf77ed0c2c3e5b7fd1260a93082b95fb7f47959b92" +checksum = "23939e44bb9a5d7576fa2b563dc2e136628f1224e88a8deed09e04858b77871f" dependencies = [ "unicode-ident", ] @@ -7883,9 +7731,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.100" +version = "0.3.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e0871acf327f283dc6da28a1696cdc64fb355ba9f935d052021fa77f35cce69" +checksum = "a6430a72df5eb332242960fe84b3002a241163998241eb596d4f739b9757061d" dependencies = [ "js-sys", "wasm-bindgen", @@ -8674,9 +8522,9 @@ dependencies = [ [[package]] name = "zeroize" -version = "1.8.2" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" +checksum = "e13c156562582aa81c60cb29407084cdb54c4164760106ab78e6c5b0858cf64e" [[package]] name = "zerotrie" @@ -8713,12 +8561,6 @@ dependencies = [ "syn", ] -[[package]] -name = "zlib-rs" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513" - [[package]] name = "zmij" version = "1.0.21" diff --git a/java/lance-jni/Cargo.toml b/java/lance-jni/Cargo.toml index eee207d912e..deb97874e82 100644 --- a/java/lance-jni/Cargo.toml +++ b/java/lance-jni/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lance-jni" -version = 
"8.0.0-beta.13" +version = "8.0.0-beta.14" edition = "2024" authors = ["Lance Devs "] rust-version = "1.91" diff --git a/java/pom.xml b/java/pom.xml index 9a88cf039b5..61bf90facf2 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -7,7 +7,7 @@ org.lance lance-core Lance Core - 8.0.0-beta.13 + 8.0.0-beta.14 jar Lance Format Java API diff --git a/python/Cargo.lock b/python/Cargo.lock index c8f484c73d0..5dfc1ba47e0 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -1121,9 +1121,9 @@ dependencies = [ [[package]] name = "brotli" -version = "8.0.3" +version = "8.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8119e4516436f5708bbc474a9d395bf12f1b5395e93a92a56e647ac3388c8610" +checksum = "5cc91aac060a7a1e25823bdccbfb6af1875b88f17c6daac97894eed8207166b3" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -1132,9 +1132,9 @@ dependencies = [ [[package]] name = "brotli-decompressor" -version = "5.0.1" +version = "5.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5962523e1b92ce1b5e793d9169b9943eece10d39f62550bc04bb605d75b94924" +checksum = "3a32acac15fe1967bc3986b2a6347dffc965602354ea6f450ad07e8bfd253583" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -2859,7 +2859,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow-array", "rand 0.9.4", @@ -3240,9 +3240,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.4.14" +version = "0.4.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "171fefbc92fe4a4de27e0698d6a5b392d6a0e333506bc49133760b3bcf948733" +checksum = "6cb093c84e8bd9b188d4c4a8cb6579fc016968d14c99882163cd3ff402a4f155" dependencies = [ "atomic-waker", "bytes", @@ -4018,9 +4018,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.100" +version = "0.3.102" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2025f20d7a4fa7785846e7b63d10a76d3f1cee98ee5cb79ea59703f95e42162" +checksum = "03d04c30968dffe80775bd4d7fb676131cd04a1fb46d2686dbffbaec2d9dfd31" dependencies = [ "cfg-if 1.0.4", "futures-util", @@ -4075,7 +4075,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arc-swap", "arrow", @@ -4149,7 +4149,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow-array", "arrow-buffer", @@ -4191,7 +4191,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrayref", "paste", @@ -4200,7 +4200,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow-array", "arrow-buffer", @@ -4238,7 +4238,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow", "arrow-array", @@ -4270,7 +4270,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow", "arrow-array", @@ -4287,7 +4287,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "proc-macro2", "quote", @@ -4296,7 +4296,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow-arith", "arrow-array", @@ -4331,7 +4331,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow-arith", "arrow-array", @@ -4361,7 +4361,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ 
"datafusion", "geo-traits", @@ -4375,7 +4375,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arc-swap", "arrow", @@ -4443,7 +4443,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow", "arrow-arith", @@ -4484,7 +4484,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow-array", "arrow-buffer", @@ -4499,7 +4499,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow", "async-trait", @@ -4511,7 +4511,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow", "arrow-ipc", @@ -4546,9 +4546,9 @@ dependencies = [ [[package]] name = "lance-namespace-reqwest-client" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d287494559c22838ce34e51ea0fa29dc780d5be8283de5ab33e9395623000c8" +checksum = "ba3f0a235e3ed5f8805205649ccc7d7d0f3df23ce1294242c9265ad488d7f19d" dependencies = [ "reqwest 0.12.28", "serde", @@ -4560,7 +4560,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow-array", "arrow-buffer", @@ -4575,7 +4575,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "arrow", "arrow-array", @@ -4614,7 +4614,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "icu_segmenter", "jieba-rs", @@ -6057,7 +6057,7 @@ dependencies = [ [[package]] name = "pylance" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" dependencies = [ "alloc-stdlib", "arrow", @@ -8440,9 +8440,9 @@ dependencies = [ 
[[package]] name = "wasip2" -version = "1.0.3+wasi-0.2.9" +version = "1.0.4+wasi-0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" +checksum = "b67efb37e106e55ce722a510d6b5f9c17f083e5fc79afc2badeb12cc313d9487" dependencies = [ "wit-bindgen 0.57.1", ] @@ -8467,9 +8467,9 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.123" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a254a4b10c19a76f09a27640e7ffbf9bc30bf67e16a3bf28aaefa4920fe81563" +checksum = "8ddb3f79143bced6de84270411622a2699cee572fc0875aeaf1e7867cf9fca1a" dependencies = [ "cfg-if 1.0.4", "once_cell", @@ -8480,9 +8480,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.73" +version = "0.4.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54568702fabf5d4849ce2b90fadfa64168a097eaf4b351ce9df8b687a0086aaf" +checksum = "503b14d284f2c8dac03b819967e155ea753f573586193b2b2c95990cb5d69280" dependencies = [ "js-sys", "wasm-bindgen", @@ -8490,9 +8490,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.123" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24a40fc75b0ec6f3746ceb10d36f53a93dcd68a93b11b6445983945d79eba0dc" +checksum = "4e21a184b13fb19e157296e2c46056aec9092264fab83e4ba59e68c61b323c3d" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -8500,9 +8500,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.123" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "908f34bd9b9ce3d4caf07b72dfab63d61504d156856c6bd3cd87fa350cf3985b" +checksum = "fecefd9c35bd935a20fc3fc344b5f29138961e4f47fb03297d88f2587afb5ebd" dependencies = [ "bumpalo", "proc-macro2", @@ -8513,9 +8513,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" 
-version = "0.2.123" +version = "0.2.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7acbf7616c27b194bbb550bf77ed0c2c3e5b7fd1260a93082b95fb7f47959b92" +checksum = "23939e44bb9a5d7576fa2b563dc2e136628f1224e88a8deed09e04858b77871f" dependencies = [ "unicode-ident", ] @@ -8582,9 +8582,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.100" +version = "0.3.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e0871acf327f283dc6da28a1696cdc64fb355ba9f935d052021fa77f35cce69" +checksum = "a6430a72df5eb332242960fe84b3002a241163998241eb596d4f739b9757061d" dependencies = [ "js-sys", "wasm-bindgen", @@ -9307,9 +9307,9 @@ dependencies = [ [[package]] name = "zeroize" -version = "1.8.2" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" +checksum = "e13c156562582aa81c60cb29407084cdb54c4164760106ab78e6c5b0858cf64e" [[package]] name = "zerotrie" diff --git a/python/Cargo.toml b/python/Cargo.toml index 82f5036a62b..f6a3c67381d 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pylance" -version = "8.0.0-beta.13" +version = "8.0.0-beta.14" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" From f405b34f7cc07e6bffa4283ba41bff20a8407c02 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Tue, 16 Jun 2026 01:05:39 +0800 Subject: [PATCH 102/177] feat: configure blob inline threshold per column (#7269) This adds per-column blob v2 inline threshold metadata so callers can choose when a blob column moves from inline data-file storage to packed sidecar storage, without changing the existing packed sidecar rolling option. The threshold is stored on the blob field metadata, matching the existing dedicated blob threshold model. 
Existing blob columns keep their policy in the dataset schema; appends that explicitly provide different threshold metadata are rejected instead of silently ignoring the input schema. The Python and Rust helpers validate threshold values at the API boundary so invalid values do not silently fall back to defaults. Closes #7268. --- docs/src/guide/blob.md | 10 + python/python/lance/blob.py | 67 +++- python/python/tests/test_blob.py | 154 ++++++++ rust/lance-arrow/src/lib.rs | 2 + rust/lance/src/blob.rs | 99 ++++- rust/lance/src/dataset/blob.rs | 595 +++++++++++++++++++++++++++++-- rust/lance/src/dataset/write.rs | 83 ++++- rust/lance/src/lib.rs | 2 +- 8 files changed, 965 insertions(+), 47 deletions(-) diff --git a/docs/src/guide/blob.md b/docs/src/guide/blob.md index b1f956a19e7..dd13fcaab34 100644 --- a/docs/src/guide/blob.md +++ b/docs/src/guide/blob.md @@ -95,6 +95,16 @@ Note: - By default, external blob URIs must map to a registered non-dataset-root base path. - If you need to reference external objects outside those bases, set `allow_external_blob_outside_bases=True` when writing. +- Blob v2 storage layout thresholds can be configured per column with + `blob_field(..., inline_size_threshold=..., dedicated_size_threshold=...)`. + The inline threshold controls when values move from the data file to packed + `.blob` sidecar storage. The dedicated threshold controls when values move + from packed sidecar storage to a dedicated `.blob` file. The dedicated + threshold is checked first. For existing columns, these thresholds are stored + in the dataset schema; appends that explicitly provide different threshold + metadata for the same column are rejected. +- `blob_pack_file_size_threshold` is a write option for rolling packed `.blob` + sidecar files. It does not control inline-vs-packed placement. 
### Example: packed external blobs (single container file) diff --git a/python/python/lance/blob.py b/python/python/lance/blob.py index 46faf760cdd..a87c9302736 100644 --- a/python/python/lance/blob.py +++ b/python/python/lance/blob.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright The Lance Authors +import ctypes import io from dataclasses import dataclass from typing import IO, Any, Iterator, Optional, Union @@ -9,6 +10,12 @@ from .lance import LanceBlobFile +_BLOB_INLINE_SIZE_THRESHOLD_META_KEY = b"lance-encoding:blob-inline-size-threshold" +_BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY = ( + b"lance-encoding:blob-dedicated-size-threshold" +) +_MAX_RUST_USIZE = ctypes.c_size_t(-1).value + @dataclass(frozen=True) class Blob: @@ -190,9 +197,63 @@ def blob_array(values: list[Any]) -> BlobArray: return BlobArray.from_pylist(values) -def blob_field(name: str, *, nullable: bool = True) -> pa.Field: - """Construct an Arrow field for a Lance blob column.""" - return pa.field(name, BlobType(), nullable=nullable) +def _validate_threshold(name: str, value: Optional[int], *, allow_zero: bool) -> None: + if value is None: + return + if isinstance(value, bool) or not isinstance(value, int): + raise TypeError(f"{name} must be an int, got {type(value).__name__}") + if allow_zero: + if value < 0: + raise ValueError(f"{name} must be non-negative") + elif value <= 0: + raise ValueError(f"{name} must be positive") + if value > _MAX_RUST_USIZE: + raise OverflowError(f"{name} must fit in a Rust usize") + + +def blob_field( + name: str, + *, + nullable: bool = True, + inline_size_threshold: Optional[int] = None, + dedicated_size_threshold: Optional[int] = None, +) -> pa.Field: + """ + Construct an Arrow field for a Lance blob column. + + Parameters + ---------- + name : str + Field name. + nullable : bool, default True + Whether the blob column accepts null values. 
+ inline_size_threshold : optional, int + Maximum payload size in bytes to keep inline in the data file before + using packed blob storage. + dedicated_size_threshold : optional, int + Maximum payload size in bytes to store in packed blob storage before + using dedicated blob storage. This threshold is checked before + ``inline_size_threshold``. + """ + _validate_threshold("inline_size_threshold", inline_size_threshold, allow_zero=True) + _validate_threshold( + "dedicated_size_threshold", dedicated_size_threshold, allow_zero=False + ) + + field = pa.field(name, BlobType(), nullable=nullable) + if inline_size_threshold is None and dedicated_size_threshold is None: + return field + + metadata = dict(field.metadata or {}) + if inline_size_threshold is not None: + metadata[_BLOB_INLINE_SIZE_THRESHOLD_META_KEY] = str( + inline_size_threshold + ).encode() + if dedicated_size_threshold is not None: + metadata[_BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY] = str( + dedicated_size_threshold + ).encode() + return field.with_metadata(metadata) class BlobIterator: diff --git a/python/python/tests/test_blob.py b/python/python/tests/test_blob.py index 9a0ad07f637..cb34ae3cafc 100644 --- a/python/python/tests/test_blob.py +++ b/python/python/tests/test_blob.py @@ -583,6 +583,160 @@ def test_blob_extension_write_inline(tmp_path): assert f.read() == b"foo" +def test_blob_field_threshold_metadata(): + field = lance.blob_field( + "blob", + inline_size_threshold=16 * 1024, + dedicated_size_threshold=2 * 1024 * 1024, + ) + + assert field.metadata[b"lance-encoding:blob-inline-size-threshold"] == b"16384" + assert field.metadata[b"lance-encoding:blob-dedicated-size-threshold"] == b"2097152" + + +@pytest.mark.parametrize( + ("kwargs", "error", "message"), + [ + pytest.param( + {"inline_size_threshold": -1}, + ValueError, + "inline_size_threshold must be non-negative", + id="negative_inline", + ), + pytest.param( + {"dedicated_size_threshold": 0}, + ValueError, + "dedicated_size_threshold must 
be positive", + id="zero_dedicated", + ), + pytest.param( + {"dedicated_size_threshold": -1}, + ValueError, + "dedicated_size_threshold must be positive", + id="negative_dedicated", + ), + pytest.param( + {"inline_size_threshold": True}, + TypeError, + "inline_size_threshold must be an int", + id="bool_inline", + ), + pytest.param( + {"dedicated_size_threshold": True}, + TypeError, + "dedicated_size_threshold must be an int", + id="bool_dedicated", + ), + pytest.param( + {"inline_size_threshold": 1.5}, + TypeError, + "inline_size_threshold must be an int", + id="float_inline", + ), + pytest.param( + {"inline_size_threshold": 2**100}, + OverflowError, + "inline_size_threshold must fit in a Rust usize", + id="overflow_inline", + ), + pytest.param( + {"dedicated_size_threshold": 2**100}, + OverflowError, + "dedicated_size_threshold must fit in a Rust usize", + id="overflow_dedicated", + ), + ], +) +def test_blob_field_rejects_invalid_thresholds(kwargs, error, message): + with pytest.raises(error, match=message): + lance.blob_field("blob", **kwargs) + + +def test_blob_extension_inline_threshold_per_column(tmp_path): + payload = b"x" * 2048 + schema = pa.schema( + [ + lance.blob_field("inline_blob", inline_size_threshold=4096), + lance.blob_field("packed_blob", inline_size_threshold=1024), + ] + ) + table = pa.table( + { + "inline_blob": lance.blob_array([payload]), + "packed_blob": lance.blob_array([payload]), + }, + schema=schema, + ) + ds = lance.write_dataset( + table, + tmp_path / "test_ds_v2_inline_threshold_per_column", + data_storage_version="2.2", + ) + + desc = ds.to_table(columns=["inline_blob", "packed_blob"]) + assert desc.column("inline_blob").chunk(0).field("kind").to_pylist() == [0] + assert desc.column("packed_blob").chunk(0).field("kind").to_pylist() == [1] + + +def test_blob_extension_threshold_metadata_persists_after_reopen(tmp_path): + dataset_path = tmp_path / "test_ds_v2_threshold_metadata_persists" + schema = pa.schema([lance.blob_field("blob", 
inline_size_threshold=1024)]) + table = pa.table({"blob": lance.blob_array([b"x"])}, schema=schema) + + lance.write_dataset(table, dataset_path, data_storage_version="2.2") + reopened = lance.dataset(dataset_path) + + assert ( + reopened.schema.field("blob").metadata[ + b"lance-encoding:blob-inline-size-threshold" + ] + == b"1024" + ) + + +def test_blob_extension_append_rejects_explicit_threshold_mismatch(tmp_path): + dataset_path = tmp_path / "test_ds_v2_append_threshold_mismatch" + initial_schema = pa.schema([lance.blob_field("blob", inline_size_threshold=4096)]) + initial = pa.table( + {"blob": lance.blob_array([b"x" * 2048])}, + schema=initial_schema, + ) + lance.write_dataset(initial, dataset_path, data_storage_version="2.2") + + append_schema = pa.schema([lance.blob_field("blob", inline_size_threshold=1024)]) + append = pa.table( + {"blob": lance.blob_array([b"x" * 2048])}, + schema=append_schema, + ) + + with pytest.raises( + OSError, match="Cannot append data with blob threshold metadata" + ): + lance.write_dataset(append, dataset_path, mode="append") + + +def test_blob_extension_dedicated_threshold_precedes_inline_threshold(tmp_path): + payload = b"x" * 2048 + schema = pa.schema( + [ + lance.blob_field( + "blob", + inline_size_threshold=4096, + dedicated_size_threshold=1024, + ) + ] + ) + table = pa.table({"blob": lance.blob_array([payload])}, schema=schema) + ds = lance.write_dataset( + table, + tmp_path / "test_ds_v2_dedicated_precedes_inline", + data_storage_version="2.2", + ) + + desc = ds.to_table(columns=["blob"]).column("blob").chunk(0) + assert desc.field("kind").to_pylist() == [2] + + def test_blob_extension_write_external(tmp_path): blob_path = tmp_path / "external_blob.bin" blob_path.write_bytes(b"hello") diff --git a/rust/lance-arrow/src/lib.rs b/rust/lance-arrow/src/lib.rs index b993cf00745..34a67600543 100644 --- a/rust/lance-arrow/src/lib.rs +++ b/rust/lance-arrow/src/lib.rs @@ -52,6 +52,8 @@ pub const BLOB_V2_EXT_NAME: &str = 
"lance.blob.v2"; /// Metadata key for overriding the dedicated blob size threshold (in bytes) pub const BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY: &str = "lance-encoding:blob-dedicated-size-threshold"; +/// Metadata key for overriding the inline blob size threshold (in bytes) +pub const BLOB_INLINE_SIZE_THRESHOLD_META_KEY: &str = "lance-encoding:blob-inline-size-threshold"; type Result = std::result::Result; diff --git a/rust/lance/src/blob.rs b/rust/lance/src/blob.rs index 322bf67a04c..58df42b5cd3 100644 --- a/rust/lance/src/blob.rs +++ b/rust/lance/src/blob.rs @@ -7,12 +7,16 @@ //! tagged with `ARROW:extension:name = "lance.blob.v2"`. This module offers a //! type-safe builder to construct that struct without manually wiring metadata +use std::num::NonZeroUsize; use std::sync::Arc; use arrow_array::{ArrayRef, StructArray, builder::LargeBinaryBuilder, builder::StringBuilder}; use arrow_buffer::NullBufferBuilder; use arrow_schema::{DataType, Field}; -use lance_arrow::{ARROW_EXT_NAME_KEY, BLOB_V2_EXT_NAME}; +use lance_arrow::{ + ARROW_EXT_NAME_KEY, BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, + BLOB_INLINE_SIZE_THRESHOLD_META_KEY, BLOB_V2_EXT_NAME, +}; use crate::{Error, Result}; @@ -21,9 +25,71 @@ use crate::{Error, Result}; /// Blob v2 expects a column shaped as `Struct` and /// tagged with `ARROW:extension:name = "lance.blob.v2"`. pub fn blob_field(name: &str, nullable: bool) -> Field { - let metadata = [(ARROW_EXT_NAME_KEY.to_string(), BLOB_V2_EXT_NAME.to_string())] + blob_field_with_options(name, nullable, BlobFieldOptions::default()) +} + +/// Options for constructing a blob v2 field. +#[derive(Clone, Debug, Default)] +pub struct BlobFieldOptions { + /// Maximum payload size to keep inline in the data file before using packed blob storage. + pub inline_size_threshold: Option, + /// Maximum payload size to store in packed blob storage before using dedicated blob storage. 
+ /// + /// A zero threshold is invalid because dedicated blob storage is selected when + /// the payload size is greater than this value. + pub dedicated_size_threshold: Option, +} + +impl BlobFieldOptions { + /// Set the maximum payload size to keep inline in the data file. + pub fn with_inline_size_threshold(mut self, threshold: usize) -> Self { + self.inline_size_threshold = Some(threshold); + self + } + + /// Set the maximum payload size to store in packed blob storage. + pub fn with_dedicated_size_threshold(mut self, threshold: NonZeroUsize) -> Self { + self.dedicated_size_threshold = Some(threshold); + self + } +} + +/// Construct the Arrow field for a blob v2 column with storage layout options. +/// +/// Blob v2 expects a column shaped as `Struct` and +/// tagged with `ARROW:extension:name = "lance.blob.v2"`. +/// +/// ``` +/// # use lance::{BlobFieldOptions, blob_field_with_options}; +/// let field = blob_field_with_options( +/// "blob", +/// true, +/// BlobFieldOptions::default().with_inline_size_threshold(16 * 1024), +/// ); +/// assert_eq!( +/// field +/// .metadata() +/// .get("lance-encoding:blob-inline-size-threshold") +/// .map(String::as_str), +/// Some("16384"), +/// ); +/// ``` +pub fn blob_field_with_options(name: &str, nullable: bool, options: BlobFieldOptions) -> Field { + let mut metadata = [(ARROW_EXT_NAME_KEY.to_string(), BLOB_V2_EXT_NAME.to_string())] .into_iter() - .collect(); + .collect::>(); + if let Some(threshold) = options.inline_size_threshold { + metadata.insert( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(), + threshold.to_string(), + ); + } + if let Some(threshold) = options.dedicated_size_threshold { + metadata.insert( + BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY.to_string(), + threshold.get().to_string(), + ); + } Field::new( name, DataType::Struct( @@ -142,6 +208,8 @@ impl BlobArrayBuilder { #[cfg(test)] mod tests { + use std::num::NonZeroUsize; + use super::*; use arrow_array::Array; use arrow_array::cast::AsArray; @@ -156,6 
+224,31 @@ mod tests { ); } + #[test] + fn test_field_metadata_with_options() { + let field = blob_field_with_options( + "blob", + true, + BlobFieldOptions::default() + .with_inline_size_threshold(16 * 1024) + .with_dedicated_size_threshold(NonZeroUsize::new(2 * 1024 * 1024).unwrap()), + ); + assert_eq!( + field + .metadata() + .get(BLOB_INLINE_SIZE_THRESHOLD_META_KEY) + .unwrap(), + "16384" + ); + assert_eq!( + field + .metadata() + .get(BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY) + .unwrap(), + "2097152" + ); + } + #[test] fn test_builder_basic() { let mut b = BlobArrayBuilder::new(4); diff --git a/rust/lance/src/dataset/blob.rs b/rust/lance/src/dataset/blob.rs index f2c243367ce..56bcc97cb32 100644 --- a/rust/lance/src/dataset/blob.rs +++ b/rust/lance/src/dataset/blob.rs @@ -19,7 +19,9 @@ use arrow_schema::DataType as ArrowDataType; use bytes::Bytes; use futures::stream::BoxStream; use futures::{FutureExt, StreamExt, TryStreamExt, stream}; -use lance_arrow::{BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, FieldExt}; +use lance_arrow::{ + BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, BLOB_INLINE_SIZE_THRESHOLD_META_KEY, FieldExt, +}; use lance_io::object_store::{ObjectStore, ObjectStoreParams, ObjectStoreRegistry}; use lance_io::scheduler::{FileScheduler, ScanScheduler, SchedulerConfig}; use object_store::path::Path; @@ -40,6 +42,58 @@ use lance_io::utils::CachedFileSize; const INLINE_MAX: usize = 64 * 1024; // 64KB inline cutoff const DEDICATED_THRESHOLD: usize = 4 * 1024 * 1024; // 4MB dedicated cutoff const PACK_FILE_MAX_SIZE: usize = 1024 * 1024 * 1024; // 1GiB per .pack sidecar + +pub(super) fn blob_inline_threshold_from_metadata( + metadata: &HashMap, + field_name: &str, +) -> Result { + blob_threshold_from_metadata( + metadata, + field_name, + BLOB_INLINE_SIZE_THRESHOLD_META_KEY, + INLINE_MAX, + true, + ) +} + +pub(super) fn blob_dedicated_threshold_from_metadata( + metadata: &HashMap, + field_name: &str, +) -> Result { + blob_threshold_from_metadata( + metadata, + 
field_name, + BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, + DEDICATED_THRESHOLD, + false, + ) +} + +fn blob_threshold_from_metadata( + metadata: &HashMap, + field_name: &str, + key: &str, + default_value: usize, + allow_zero: bool, +) -> Result { + let Some(value) = metadata.get(key) else { + return Ok(default_value); + }; + let threshold = value.parse::().map_err(|_| { + Error::invalid_input(format!( + "Invalid blob threshold metadata {key}={value:?} for field '{field_name}'; \ + expected a non-negative integer that fits in usize" + )) + })?; + if !allow_zero && threshold == 0 { + return Err(Error::invalid_input(format!( + "Invalid blob threshold metadata {key}={value:?} for field '{field_name}'; \ + expected a positive integer" + ))); + } + Ok(threshold) +} + #[derive(Clone, Debug, PartialEq, Eq)] pub(super) struct ResolvedExternalBase { pub base_id: u32, @@ -206,6 +260,7 @@ pub struct BlobPreprocessor { local_counter: u32, pack_writer: PackWriter, blob_v2_cols: Vec, + inline_thresholds: Vec, dedicated_thresholds: Vec, writer_metadata: Vec>, external_base_resolver: Option>, @@ -313,7 +368,7 @@ impl BlobPreprocessor { source_store_registry: Arc, source_store_params: ObjectStoreParams, pack_file_size_threshold: Option, - ) -> Self { + ) -> Result { let mut pack_writer = PackWriter::new( object_store.clone(), data_dir.clone(), @@ -324,16 +379,37 @@ impl BlobPreprocessor { } let arrow_schema = arrow_schema::Schema::from(schema); let fields = arrow_schema.fields(); - let blob_v2_cols = fields.iter().map(|field| field.is_blob_v2()).collect(); + let blob_v2_cols = fields + .iter() + .map(|field| field.is_blob_v2()) + .collect::>(); + let inline_thresholds = fields + .iter() + .zip(blob_v2_cols.iter()) + .map(|(field, is_blob_v2)| { + if *is_blob_v2 { + blob_inline_threshold_from_metadata(field.metadata(), field.name()) + } else { + Ok(INLINE_MAX) + } + }) + .collect::>>()?; let dedicated_thresholds = fields .iter() - .map(|field| 
dedicated_threshold_from_metadata(field.as_ref())) - .collect(); + .zip(blob_v2_cols.iter()) + .map(|(field, is_blob_v2)| { + if *is_blob_v2 { + blob_dedicated_threshold_from_metadata(field.metadata(), field.name()) + } else { + Ok(DEDICATED_THRESHOLD) + } + }) + .collect::>>()?; let writer_metadata = fields .iter() .map(|field| field.metadata().clone()) .collect(); - Self { + Ok(Self { object_store, data_dir, data_file_key, @@ -341,6 +417,7 @@ impl BlobPreprocessor { local_counter: 1, pack_writer, blob_v2_cols, + inline_thresholds, dedicated_thresholds, writer_metadata, external_base_resolver, @@ -348,7 +425,7 @@ impl BlobPreprocessor { external_blob_mode, source_store_registry, source_store_params, - } + }) } fn next_blob_id(&mut self) -> u32 { @@ -523,6 +600,7 @@ impl BlobPreprocessor { let data_len = if has_data { data_col.value(i).len() } else { 0 }; let dedicated_threshold = self.dedicated_thresholds[idx]; + let inline_threshold = self.inline_thresholds[idx]; if has_data && data_len > dedicated_threshold { let blob_id = self.next_blob_id(); self.write_dedicated(blob_id, BlobWriteSource::Bytes(data_col.value(i))) @@ -537,7 +615,7 @@ impl BlobPreprocessor { continue; } - if has_data && data_len > INLINE_MAX { + if has_data && data_len > inline_threshold { let (pack_blob_id, position) = self .write_packed(BlobWriteSource::Bytes(data_col.value(i))) .await?; @@ -586,7 +664,7 @@ impl BlobPreprocessor { continue; } - if data_len > INLINE_MAX as u64 { + if data_len > inline_threshold as u64 { let (pack_blob_id, position) = self .write_packed(BlobWriteSource::External(&source)) .await?; @@ -700,16 +778,6 @@ impl BlobPreprocessor { } } -fn dedicated_threshold_from_metadata(field: &arrow_schema::Field) -> usize { - field - .metadata() - .get(BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY) - .and_then(|value| value.parse::().ok()) - .filter(|value| *value > 0) - .and_then(|value| usize::try_from(value).ok()) - .unwrap_or(DEDICATED_THRESHOLD) -} - pub async fn 
preprocess_blob_batches( batches: &[RecordBatch], pre: &mut BlobPreprocessor, @@ -2111,7 +2179,8 @@ mod tests { use chrono::Utc; use futures::{StreamExt, TryStreamExt, future::try_join_all}; use lance_arrow::{ - ARROW_EXT_NAME_KEY, BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, BLOB_V2_EXT_NAME, DataTypeExt, + ARROW_EXT_NAME_KEY, BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, + BLOB_INLINE_SIZE_THRESHOLD_META_KEY, BLOB_V2_EXT_NAME, DataTypeExt, }; use lance_core::datatypes::BlobKind; use lance_io::object_store::{ @@ -2142,7 +2211,7 @@ mod tests { use crate::{ Dataset, blob::{BlobArrayBuilder, blob_field}, - dataset::{ExternalBlobMode, WriteParams}, + dataset::{ExternalBlobMode, WriteMode, WriteParams}, utils::test::TestDatasetGenerator, }; @@ -3621,6 +3690,50 @@ mod tests { assert_eq!(blobs[0].read().await.unwrap().as_ref(), payload.as_slice()); } + #[tokio::test] + async fn test_blob_v2_external_ingest_respects_inline_threshold() { + let dataset_dir = TempDir::default(); + let external_dir = TempDir::default(); + let external_path = external_dir.std_path().join("external.bin"); + let payload = vec![0x5A; 2048]; + std::fs::write(&external_path, &payload).unwrap(); + let external_uri = format!("file://{}", external_path.display()); + + let mut blob_builder = BlobArrayBuilder::new(1); + blob_builder.push_uri(external_uri).unwrap(); + let blob_array: arrow_array::ArrayRef = blob_builder.finish().unwrap(); + + let mut field = blob_field("blob", true); + let mut metadata = field.metadata().clone(); + metadata.insert( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(), + "1024".to_string(), + ); + field = field.with_metadata(metadata); + let schema = Arc::new(Schema::new(vec![field])); + let batch = RecordBatch::try_new(schema.clone(), vec![blob_array]).unwrap(); + let reader = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema); + + let dataset = Arc::new( + Dataset::write( + reader, + &dataset_dir.path_str(), + Some(WriteParams { + data_storage_version: 
Some(LanceFileVersion::V2_2), + external_blob_mode: ExternalBlobMode::Ingest, + ..Default::default() + }), + ) + .await + .unwrap(), + ); + + let blobs = dataset.take_blobs_by_indices(&[0], "blob").await.unwrap(); + assert_eq!(blobs.len(), 1); + assert_eq!(blobs[0].kind(), BlobKind::Packed); + assert_eq!(blobs[0].read().await.unwrap().as_ref(), payload.as_slice()); + } + #[tokio::test] async fn test_blob_v2_external_ingest_dedicated() { let dataset_dir = TempDir::default(); @@ -3713,7 +3826,10 @@ mod tests { ); } - async fn preprocess_kind_with_schema_metadata(metadata_value: &str, data_len: usize) -> u8 { + async fn try_preprocess_kind_with_blob_metadata( + metadata_entries: Vec<(&'static str, String)>, + data_len: usize, + ) -> Result { let (object_store, base_path) = ObjectStore::from_uri_and_params( Arc::new(ObjectStoreRegistry::default()), "memory://blob_preprocessor", @@ -3726,10 +3842,9 @@ mod tests { let mut field = blob_field("blob", true); let mut metadata = field.metadata().clone(); - metadata.insert( - BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY.to_string(), - metadata_value.to_string(), - ); + for (key, value) in metadata_entries { + metadata.insert(key.to_string(), value); + } field = field.with_metadata(metadata); let writer_arrow_schema = Schema::new(vec![field.clone()]); @@ -3746,7 +3861,7 @@ mod tests { Arc::new(ObjectStoreRegistry::default()), ObjectStoreParams::default(), None, - ); + )?; let mut blob_builder = BlobArrayBuilder::new(1); blob_builder.push_bytes(vec![0u8; data_len]).unwrap(); @@ -3757,36 +3872,442 @@ mod tests { let batch_schema = Arc::new(Schema::new(vec![field_without_metadata])); let batch = RecordBatch::try_new(batch_schema, vec![blob_array]).unwrap(); - let out = preprocessor.preprocess_batch(&batch).await.unwrap(); + let out = preprocessor.preprocess_batch(&batch).await?; let struct_arr = out .column(0) .as_any() .downcast_ref::() .unwrap(); - struct_arr + Ok(struct_arr .column_by_name("kind") .unwrap() .as_primitive::() - 
.value(0) + .value(0)) + } + + async fn preprocess_kind_with_blob_metadata( + metadata_entries: Vec<(&'static str, String)>, + data_len: usize, + ) -> u8 { + try_preprocess_kind_with_blob_metadata(metadata_entries, data_len) + .await + .unwrap() } #[tokio::test] - async fn test_blob_v2_dedicated_threshold_ignores_non_positive_metadata() { - let kind = preprocess_kind_with_schema_metadata("0", 256 * 1024).await; - assert_eq!(kind, lance_core::datatypes::BlobKind::Packed as u8); + async fn test_blob_v2_dedicated_threshold_rejects_non_positive_metadata() { + let err = try_preprocess_kind_with_blob_metadata( + vec![(BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, "0".to_string())], + 256 * 1024, + ) + .await + .unwrap_err(); + assert!(err.to_string().contains("expected a positive integer")); + } + + #[tokio::test] + async fn test_blob_v2_inline_threshold_rejects_invalid_metadata() { + let err = try_preprocess_kind_with_blob_metadata( + vec![( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY, + "not-a-number".to_string(), + )], + 256 * 1024, + ) + .await + .unwrap_err(); + assert!( + err.to_string() + .contains("expected a non-negative integer that fits in usize") + ); + } + + #[tokio::test] + async fn test_blob_v2_write_rejects_invalid_inline_threshold_metadata() { + let dataset_dir = TempDir::default(); + let mut field = blob_field("blob", true); + let mut metadata = field.metadata().clone(); + metadata.insert( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(), + "not-a-number".to_string(), + ); + field = field.with_metadata(metadata); + let schema = Arc::new(Schema::new(vec![field])); + + let mut blob_builder = BlobArrayBuilder::new(1); + blob_builder.push_bytes(vec![0u8; 256]).unwrap(); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(blob_builder.finish().unwrap()) as ArrayRef], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + + let result = Dataset::write( + reader, + &dataset_dir.path_str(), + Some(WriteParams { + 
data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await; + let Err(err) = result else { + panic!("write with invalid blob threshold metadata should fail"); + }; + assert!( + err.to_string() + .contains("expected a non-negative integer that fits in usize") + ); } #[tokio::test] async fn test_blob_v2_dedicated_threshold_respects_smaller_metadata() { - let kind = preprocess_kind_with_schema_metadata("131072", 256 * 1024).await; + let kind = preprocess_kind_with_blob_metadata( + vec![(BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, "131072".to_string())], + 256 * 1024, + ) + .await; assert_eq!(kind, lance_core::datatypes::BlobKind::Dedicated as u8); } #[tokio::test] async fn test_blob_v2_dedicated_threshold_respects_larger_metadata() { - let kind = - preprocess_kind_with_schema_metadata("8388608", super::DEDICATED_THRESHOLD + 1024) - .await; + let kind = preprocess_kind_with_blob_metadata( + vec![( + BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, + "8388608".to_string(), + )], + super::DEDICATED_THRESHOLD + 1024, + ) + .await; assert_eq!(kind, lance_core::datatypes::BlobKind::Packed as u8); } + + #[tokio::test] + async fn test_blob_v2_inline_threshold_respects_smaller_metadata() { + let kind = preprocess_kind_with_blob_metadata( + vec![(BLOB_INLINE_SIZE_THRESHOLD_META_KEY, "1024".to_string())], + 2048, + ) + .await; + assert_eq!(kind, lance_core::datatypes::BlobKind::Packed as u8); + } + + #[tokio::test] + async fn test_blob_v2_inline_threshold_respects_larger_metadata() { + let kind = preprocess_kind_with_blob_metadata( + vec![( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY, + (super::INLINE_MAX + 8192).to_string(), + )], + super::INLINE_MAX + 4096, + ) + .await; + assert_eq!(kind, lance_core::datatypes::BlobKind::Inline as u8); + } + + #[tokio::test] + async fn test_blob_v2_inline_threshold_uses_strict_greater_than() { + let kind = preprocess_kind_with_blob_metadata( + vec![(BLOB_INLINE_SIZE_THRESHOLD_META_KEY, "1024".to_string())], + 1024, + ) + 
.await; + assert_eq!(kind, lance_core::datatypes::BlobKind::Inline as u8); + } + + #[tokio::test] + async fn test_blob_v2_dedicated_threshold_uses_strict_greater_than() { + let kind = preprocess_kind_with_blob_metadata( + vec![ + (BLOB_INLINE_SIZE_THRESHOLD_META_KEY, "2048".to_string()), + (BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, "1024".to_string()), + ], + 1024, + ) + .await; + assert_eq!(kind, lance_core::datatypes::BlobKind::Inline as u8); + } + + #[tokio::test] + async fn test_blob_v2_inline_threshold_does_not_override_dedicated_threshold() { + let kind = preprocess_kind_with_blob_metadata( + vec![ + (BLOB_INLINE_SIZE_THRESHOLD_META_KEY, "8192".to_string()), + (BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, "4096".to_string()), + ], + 6144, + ) + .await; + assert_eq!(kind, lance_core::datatypes::BlobKind::Dedicated as u8); + } + + #[tokio::test] + async fn test_blob_v2_inline_threshold_is_per_column() { + let (object_store, base_path) = ObjectStore::from_uri_and_params( + Arc::new(ObjectStoreRegistry::default()), + "memory://blob_preprocessor", + &ObjectStoreParams::default(), + ) + .await + .unwrap(); + let object_store = object_store.as_ref().clone(); + let data_dir = base_path.clone().join("data"); + + let mut inline_field = blob_field("inline_blob", true); + let mut inline_metadata = inline_field.metadata().clone(); + inline_metadata.insert( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(), + "4096".to_string(), + ); + inline_field = inline_field.with_metadata(inline_metadata); + + let mut packed_field = blob_field("packed_blob", true); + let mut packed_metadata = packed_field.metadata().clone(); + packed_metadata.insert( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(), + "1024".to_string(), + ); + packed_field = packed_field.with_metadata(packed_metadata); + + let writer_arrow_schema = Schema::new(vec![inline_field.clone(), packed_field.clone()]); + let writer_schema = lance_core::datatypes::Schema::try_from(&writer_arrow_schema).unwrap(); + + let mut 
preprocessor = super::BlobPreprocessor::new( + object_store.clone(), + data_dir, + "data_file_key".to_string(), + &writer_schema, + None, + false, + ExternalBlobMode::Reference, + Arc::new(ObjectStoreRegistry::default()), + ObjectStoreParams::default(), + None, + ) + .unwrap(); + + let mut inline_builder = BlobArrayBuilder::new(1); + inline_builder.push_bytes(vec![0u8; 2048]).unwrap(); + let inline_array: arrow_array::ArrayRef = inline_builder.finish().unwrap(); + + let mut packed_builder = BlobArrayBuilder::new(1); + packed_builder.push_bytes(vec![0u8; 2048]).unwrap(); + let packed_array: arrow_array::ArrayRef = packed_builder.finish().unwrap(); + + let batch_schema = Arc::new(Schema::new(vec![ + Field::new( + "inline_blob", + inline_field.data_type().clone(), + inline_field.is_nullable(), + ), + Field::new( + "packed_blob", + packed_field.data_type().clone(), + packed_field.is_nullable(), + ), + ])); + let batch = RecordBatch::try_new(batch_schema, vec![inline_array, packed_array]).unwrap(); + + let out = preprocessor.preprocess_batch(&batch).await.unwrap(); + let inline_kind = out + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .column_by_name("kind") + .unwrap() + .as_primitive::() + .value(0); + let packed_kind = out + .column(1) + .as_any() + .downcast_ref::() + .unwrap() + .column_by_name("kind") + .unwrap() + .as_primitive::() + .value(0); + + assert_eq!(inline_kind, lance_core::datatypes::BlobKind::Inline as u8); + assert_eq!(packed_kind, lance_core::datatypes::BlobKind::Packed as u8); + } + + #[tokio::test] + async fn test_blob_v2_append_rejects_explicit_inline_threshold_mismatch() { + let dataset_dir = TempDir::default(); + let payload = vec![0u8; 2048]; + + let schema = Arc::new(Schema::new(vec![blob_field("blob", true)])); + let mut initial_builder = BlobArrayBuilder::new(1); + initial_builder.push_bytes(payload.clone()).unwrap(); + let initial_batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(initial_builder.finish().unwrap()) 
as ArrayRef], + ) + .unwrap(); + let initial_reader = RecordBatchIterator::new(vec![Ok(initial_batch)], schema); + let dataset = Dataset::write( + initial_reader, + &dataset_dir.path_str(), + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await + .unwrap(); + + let mut append_field = blob_field("blob", true); + let mut append_metadata = append_field.metadata().clone(); + append_metadata.insert( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(), + "1024".to_string(), + ); + append_field = append_field.with_metadata(append_metadata); + let append_schema = Arc::new(Schema::new(vec![append_field])); + let mut append_builder = BlobArrayBuilder::new(1); + append_builder.push_bytes(payload).unwrap(); + let append_batch = RecordBatch::try_new( + append_schema.clone(), + vec![Arc::new(append_builder.finish().unwrap()) as ArrayRef], + ) + .unwrap(); + let append_reader = RecordBatchIterator::new(vec![Ok(append_batch)], append_schema); + + let result = Dataset::write( + append_reader, + Arc::new(dataset), + Some(WriteParams { + mode: WriteMode::Append, + ..Default::default() + }), + ) + .await; + let Err(err) = result else { + panic!("append with explicit blob threshold mismatch should fail"); + }; + let message = err.to_string(); + assert!(message.contains("Cannot append data with blob threshold metadata")); + assert!(message.contains(BLOB_INLINE_SIZE_THRESHOLD_META_KEY)); + } + + #[tokio::test] + async fn test_blob_v2_append_rejects_threshold_mismatch_with_non_blob_input_extension() { + let dataset_dir = TempDir::default(); + let payload = vec![0u8; 2048]; + + let schema = Arc::new(Schema::new(vec![blob_field("blob", true)])); + let mut initial_builder = BlobArrayBuilder::new(1); + initial_builder.push_bytes(payload.clone()).unwrap(); + let initial_batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(initial_builder.finish().unwrap()) as ArrayRef], + ) + .unwrap(); + let initial_reader = 
RecordBatchIterator::new(vec![Ok(initial_batch)], schema); + let dataset = Dataset::write( + initial_reader, + &dataset_dir.path_str(), + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await + .unwrap(); + + let mut append_field = blob_field("blob", true); + let mut append_metadata = append_field.metadata().clone(); + append_metadata.insert( + ARROW_EXT_NAME_KEY.to_string(), + "some.other.extension".to_string(), + ); + append_metadata.insert( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(), + "1024".to_string(), + ); + append_field = append_field.with_metadata(append_metadata); + let append_schema = Arc::new(Schema::new(vec![append_field])); + let mut append_builder = BlobArrayBuilder::new(1); + append_builder.push_bytes(payload).unwrap(); + let append_batch = RecordBatch::try_new( + append_schema.clone(), + vec![Arc::new(append_builder.finish().unwrap()) as ArrayRef], + ) + .unwrap(); + let append_reader = RecordBatchIterator::new(vec![Ok(append_batch)], append_schema); + + let result = Dataset::write( + append_reader, + Arc::new(dataset), + Some(WriteParams { + mode: WriteMode::Append, + ..Default::default() + }), + ) + .await; + let Err(err) = result else { + panic!("append with ignored blob threshold metadata should fail"); + }; + let message = err.to_string(); + assert!(message.contains("Cannot append data with blob threshold metadata")); + assert!(message.contains(BLOB_INLINE_SIZE_THRESHOLD_META_KEY)); + } + + #[tokio::test] + async fn test_blob_v2_append_accepts_explicit_default_inline_threshold() { + let dataset_dir = TempDir::default(); + let payload = vec![0u8; 2048]; + + let schema = Arc::new(Schema::new(vec![blob_field("blob", true)])); + let mut initial_builder = BlobArrayBuilder::new(1); + initial_builder.push_bytes(payload.clone()).unwrap(); + let initial_batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(initial_builder.finish().unwrap()) as ArrayRef], + ) + .unwrap(); + 
let initial_reader = RecordBatchIterator::new(vec![Ok(initial_batch)], schema); + let dataset = Dataset::write( + initial_reader, + &dataset_dir.path_str(), + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await + .unwrap(); + + let mut append_field = blob_field("blob", true); + let mut append_metadata = append_field.metadata().clone(); + append_metadata.insert( + BLOB_INLINE_SIZE_THRESHOLD_META_KEY.to_string(), + super::INLINE_MAX.to_string(), + ); + append_field = append_field.with_metadata(append_metadata); + let append_schema = Arc::new(Schema::new(vec![append_field])); + let mut append_builder = BlobArrayBuilder::new(1); + append_builder.push_bytes(payload).unwrap(); + let append_batch = RecordBatch::try_new( + append_schema.clone(), + vec![Arc::new(append_builder.finish().unwrap()) as ArrayRef], + ) + .unwrap(); + let append_reader = RecordBatchIterator::new(vec![Ok(append_batch)], append_schema); + + let dataset = Dataset::write( + append_reader, + Arc::new(dataset), + Some(WriteParams { + mode: WriteMode::Append, + ..Default::default() + }), + ) + .await + .unwrap(); + assert_eq!(dataset.count_rows(None).await.unwrap(), 2); + } } diff --git a/rust/lance/src/dataset/write.rs b/rust/lance/src/dataset/write.rs index b42b1f1cba9..37a3d584271 100644 --- a/rust/lance/src/dataset/write.rs +++ b/rust/lance/src/dataset/write.rs @@ -6,7 +6,10 @@ use chrono::TimeDelta; use datafusion::physical_plan::SendableRecordBatchStream; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use futures::{Stream, StreamExt, TryStreamExt}; -use lance_arrow::BLOB_META_KEY; +use lance_arrow::{ + ARROW_EXT_NAME_KEY, BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, + BLOB_INLINE_SIZE_THRESHOLD_META_KEY, BLOB_META_KEY, BLOB_V2_EXT_NAME, +}; use lance_core::datatypes::{ NullabilityComparison, OnMissing, OnTypeMismatch, SchemaCompareOptions, }; @@ -35,7 +38,9 @@ use tracing::{info, instrument}; use crate::Dataset; use 
crate::dataset::blob::{ - BlobPreprocessor, ExternalBaseCandidate, ExternalBaseResolver, preprocess_blob_batches, + BlobPreprocessor, ExternalBaseCandidate, ExternalBaseResolver, + blob_dedicated_threshold_from_metadata, blob_inline_threshold_from_metadata, + preprocess_blob_batches, }; use crate::session::Session; @@ -170,6 +175,77 @@ fn validate_external_blob_write_params(params: &WriteParams) -> Result<()> { Ok(()) } +fn validate_blob_threshold_metadata_for_append( + input_schema: &Schema, + dataset_schema: &Schema, +) -> Result<()> { + for input_field in &input_schema.fields { + let Some(dataset_field) = dataset_schema.field(&input_field.name) else { + continue; + }; + let input_is_blob_v2 = input_field + .metadata + .get(ARROW_EXT_NAME_KEY) + .is_some_and(|extension_name| extension_name == BLOB_V2_EXT_NAME); + let dataset_is_blob_v2 = dataset_field + .metadata + .get(ARROW_EXT_NAME_KEY) + .is_some_and(|extension_name| extension_name == BLOB_V2_EXT_NAME); + if !input_is_blob_v2 && !dataset_is_blob_v2 { + continue; + } + + let has_inline_threshold = input_field + .metadata + .contains_key(BLOB_INLINE_SIZE_THRESHOLD_META_KEY); + let has_dedicated_threshold = input_field + .metadata + .contains_key(BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY); + if !has_inline_threshold && !has_dedicated_threshold { + continue; + } + + if has_inline_threshold { + let input_inline_threshold = + blob_inline_threshold_from_metadata(&input_field.metadata, &input_field.name)?; + let dataset_inline_threshold = + blob_inline_threshold_from_metadata(&dataset_field.metadata, &dataset_field.name)?; + if input_inline_threshold != dataset_inline_threshold { + return Err(Error::invalid_input(format!( + "Cannot append data with blob threshold metadata {}={} for field '{}'; \ + the dataset schema has effective value {}. 
Blob thresholds for existing \ + columns are stored in the dataset schema.", + BLOB_INLINE_SIZE_THRESHOLD_META_KEY, + input_inline_threshold, + input_field.name, + dataset_inline_threshold, + ))); + } + } + if has_dedicated_threshold { + let input_dedicated_threshold = + blob_dedicated_threshold_from_metadata(&input_field.metadata, &input_field.name)?; + let dataset_dedicated_threshold = blob_dedicated_threshold_from_metadata( + &dataset_field.metadata, + &dataset_field.name, + )?; + if input_dedicated_threshold != dataset_dedicated_threshold { + return Err(Error::invalid_input(format!( + "Cannot append data with blob threshold metadata {}={} for field '{}'; \ + the dataset schema has effective value {}. Blob thresholds for existing \ + columns are stored in the dataset schema.", + BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, + input_dedicated_threshold, + input_field.name, + dataset_dedicated_threshold, + ))); + } + } + } + + Ok(()) +} + /// Auto cleanup parameters #[derive(Debug, Clone)] pub struct AutoCleanupParams { @@ -953,6 +1029,7 @@ pub async fn write_fragments_internal( ..Default::default() }, )?; + validate_blob_threshold_metadata_for_append(&converted_schema, dataset.schema())?; let write_schema = dataset.schema().project_by_schema( &converted_schema, OnMissing::Error, @@ -1257,7 +1334,7 @@ async fn open_writer_with_options( source_store_registry, source_store_params, blob_pack_file_size_threshold, - )) + )?) 
} else { None }; diff --git a/rust/lance/src/lib.rs b/rust/lance/src/lib.rs index 284e10a9b6f..729cf2ffbe7 100644 --- a/rust/lance/src/lib.rs +++ b/rust/lance/src/lib.rs @@ -90,7 +90,7 @@ pub mod pb { include!(concat!(env!("OUT_DIR"), "/lance.pb.rs")); } -pub use blob::{BlobArrayBuilder, blob_field}; +pub use blob::{BlobArrayBuilder, BlobFieldOptions, blob_field, blob_field_with_options}; pub use dataset::Dataset; use lance_index::vector::DIST_COL; From c626c1ab13eaffcef6205a8594d15019e38be346 Mon Sep 17 00:00:00 2001 From: Nikolay Skovorodin Date: Tue, 16 Jun 2026 08:22:54 +0700 Subject: [PATCH 103/177] fix: record TakeExec output and I/O metrics (#7228) # Problem Closes #7227. The IoMetrics of the `take` operator are broken. The metrics such as `output_bytes=0.0 B`, `output_batches=0`, `bytes_read=0` are not visible. Example: `... Take: elapsed=350.781983256s, columns="_distance, _rowid, (vec)", metrics=[output_rows=10.00 K, elapsed_compute=302.52s, output_bytes=0.0 B, output_batches=0, batches_processed=1, bytes_read=0, iops=0, requests=0] ...` # Cause `TakeExec` was recording output row metrics inside `map_batch`, before the spawned take work had completed and before the final `RecordBatch` was emitted through DataFusion's metrics path. As a result, `output_batches`, `output_bytes`, and `ScanScheduler` I/O metrics stayed at zero even when take work read data successfully. # Solution This moves metric recording to the post-`try_buffered` result path, records the final `RecordBatch` with `BaselineMetrics::record_poll`, and records `ScanScheduler` I/O metrics after the actual take/read work completes. The stream also finalizes baseline metrics and records one final I/O snapshot when it finishes. Added a regression test covering output rows, output batches, output bytes, bytes read, IOPS, requests, and `batches_processed`. 
--- rust/lance/src/io/exec/take.rs | 104 ++++++++++++++++++++++++++++++--- 1 file changed, 97 insertions(+), 7 deletions(-) diff --git a/rust/lance/src/io/exec/take.rs b/rust/lance/src/io/exec/take.rs index 977a9c88dce..c3642cdb043 100644 --- a/rust/lance/src/io/exec/take.rs +++ b/rust/lance/src/io/exec/take.rs @@ -4,6 +4,7 @@ use std::borrow::Cow; use std::collections::{HashMap, HashSet}; use std::sync::{Arc, Mutex}; +use std::task::Poll; use arrow::array::AsArray; use arrow::compute::{TakeOptions, concat_batches}; @@ -27,6 +28,7 @@ use lance_arrow::RecordBatchExt; use lance_core::datatypes::{Field, OnMissing, Projection}; use lance_core::error::{DataFusionResult, LanceOptionExt}; use lance_core::utils::address::RowAddress; +use lance_core::utils::futures::FinallyStreamExt; use lance_core::utils::tokio::get_num_compute_intensive_cpus; use lance_core::{ROW_ADDR, ROW_ID}; use lance_io::scheduler::{ScanScheduler, SchedulerConfig}; @@ -353,10 +355,6 @@ impl TakeStream { (None, None) => {} } - self.metrics - .baseline_metrics - .record_output(new_data.num_rows()); - self.metrics.batches_processed.add(1); Ok(batch.merge_with_schema(&new_data, self.output_schema.as_ref())?) 
} @@ -364,8 +362,10 @@ impl TakeStream { self: Arc, input: S, ) -> impl Stream> { - let scan_scheduler = self.scan_scheduler.clone(); - let metrics = self.metrics.clone(); + let result_scan_scheduler = self.scan_scheduler.clone(); + let final_scan_scheduler = self.scan_scheduler.clone(); + let result_metrics = self.metrics.clone(); + let final_metrics = self.metrics.clone(); let batches = input .enumerate() .map(move |(batch_index, batch)| { @@ -378,8 +378,24 @@ impl TakeStream { }) .boxed(); batches - .inspect_ok(move |_| metrics.io_metrics.record(&scan_scheduler)) .try_buffered(get_num_compute_intensive_cpus()) + .map(move |result| { + if result.is_ok() { + result_metrics.batches_processed.add(1); + } + result_metrics.io_metrics.record(&result_scan_scheduler); + match result_metrics + .baseline_metrics + .record_poll(Poll::Ready(Some(result))) + { + Poll::Ready(Some(result)) => result, + _ => unreachable!("record_poll returned a different poll state"), + } + }) + .finally(move || { + final_metrics.baseline_metrics.done(); + final_metrics.io_metrics.record(&final_scan_scheduler); + }) } } @@ -839,6 +855,80 @@ mod tests { } } + #[tokio::test(flavor = "current_thread")] + async fn test_take_records_output_and_io_metrics() { + use datafusion::physical_plan::metrics::MetricValue; + use lance_datafusion::utils::{BYTES_READ_METRIC, IOPS_METRIC, REQUESTS_METRIC}; + let TestFixture { + dataset, + _tmp_dir_guard, + } = test_fixture().await; + + let row_addrs = UInt64Array::from(vec![0_u64, 1, 2, 3, 4]); + let schema = Arc::new(ArrowSchema::new(vec![Field::new( + ROW_ADDR, + DataType::UInt64, + true, + )])); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(row_addrs)]).unwrap(); + let stream = futures::stream::iter(vec![Ok(batch)]); + let stream = Box::pin(RecordBatchStreamAdapter::new(schema, stream)); + let input = Arc::new(OneShotExec::new(stream)); + + let projection = dataset + .empty_projection() + .union_column("s", OnMissing::Error) + .unwrap(); + + 
let take_exec = TakeExec::try_new(dataset, input, projection) + .unwrap() + .unwrap(); + + let stream = take_exec + .execute(0, Arc::new(TaskContext::default())) + .unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + assert_eq!(batches.iter().map(|b| b.num_rows()).sum::(), 5); + + let metrics = take_exec.metrics().unwrap(); + + let output_batches: usize = metrics + .iter() + .filter_map(|m| match m.value() { + MetricValue::OutputBatches(count) => Some(count.value()), + _ => None, + }) + .sum(); + + let output_bytes: usize = metrics + .iter() + .filter_map(|m| match m.value() { + MetricValue::OutputBytes(count) => Some(count.value()), + _ => None, + }) + .sum(); + + let gauge = |name: &str| -> usize { + metrics + .iter_gauges() + .find_map(|(metric_name, gauge)| { + (metric_name.as_ref() == name).then(|| gauge.value()) + }) + .unwrap_or(0) + }; + + let bytes_read = gauge(BYTES_READ_METRIC); + let iops = gauge(IOPS_METRIC); + let requests = gauge(REQUESTS_METRIC); + + assert_eq!(metrics.output_rows(), Some(5)); + assert_eq!(metrics.find_count("batches_processed").unwrap().value(), 1); + assert!( + output_batches > 0 && output_bytes > 0 && bytes_read > 0 && iops > 0 && requests > 0, + "expected positive TakeExec metrics, got output_batches={output_batches}, output_bytes={output_bytes}, bytes_read={bytes_read}, iops={iops}, requests={requests}" + ); + } + #[tokio::test] async fn test_take_order() { let TestFixture { From 8743fbaf9ce0dce15373629a7574e14d5c6c9367 Mon Sep 17 00:00:00 2001 From: Yang Cen Date: Tue, 16 Jun 2026 13:13:34 +0800 Subject: [PATCH 104/177] fix(index): share FTS position stream offsets (#7275) ## Bug Fix ### What is the bug? FTS phrase-query position cache hits clone `SharedPositionStream`. The stream already used shared `Bytes` for the compressed position payload, but `block_offsets` was stored as `Vec<u32>`, so each clone copied one offset per compressed position block. ### What issues or incorrect behavior does the bug cause? 
For large shared-position streams, repeated cache-hit clones can add avoidable O(N) CPU and memory traffic even when the position data itself is cached. ### How does this PR fix the problem? `SharedPositionStream` now stores `block_offsets` as `Arc<[u32]>`, keeping the existing `block_offsets() -> &[u32]` API while making clones share the offset allocation. A regression test verifies that cloned streams point at the same offsets allocation. ## Validation - `cargo test -p lance-index scalar::inverted::index::tests::test_shared_position_stream_clone_shares_block_offsets` - `cargo test -p lance-index shared_position` - `cargo test -p lance-index shared_stream` - `cargo test -p lance-index scalar::inverted::cache_codec::tests` - `cargo test -p lance-index` - `cargo fmt --all` - `cargo clippy --all --tests --benches -- -D warnings` - `git diff --check` --- rust/lance-index/src/scalar/inverted/index.rs | 28 ++++++++++++++----- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index 56547c6510b..92379a9e350 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -2947,10 +2947,9 @@ impl DeepSizeOf for CompressedPositionStorage { #[derive(Debug, Clone, PartialEq, Eq, Default)] pub struct SharedPositionStream { codec: PositionStreamCodec, - block_offsets: Vec, - // Stored as `Bytes` so that the cache deserialization path can hand - // ownership of an IPC-decoded slice in without copying. Cloning the - // stream is then an `Arc` bump rather than an O(N) buffer copy. + block_offsets: Arc<[u32]>, + // Stored with shared ownership so cache hits can clone position streams + // without copying either offsets or bytes. 
bytes: bytes::Bytes, } @@ -2958,7 +2957,7 @@ impl SharedPositionStream { pub fn new(codec: PositionStreamCodec, block_offsets: Vec, bytes: bytes::Bytes) -> Self { Self { codec, - block_offsets, + block_offsets: Arc::from(block_offsets.into_boxed_slice()), bytes, } } @@ -2991,11 +2990,11 @@ impl SharedPositionStream { } pub fn block_offsets(&self) -> &[u32] { - &self.block_offsets + self.block_offsets.as_ref() } pub fn size(&self) -> usize { - self.block_offsets.capacity() * std::mem::size_of::() + self.bytes.len() + self.block_offsets.len() * std::mem::size_of::() + self.bytes.len() } } @@ -5475,6 +5474,21 @@ mod tests { ); } + #[test] + fn test_shared_position_stream_clone_shares_block_offsets() { + let stream = SharedPositionStream::new( + PositionStreamCodec::PackedDelta, + vec![0_u32, 4, 11], + bytes::Bytes::from_static(b"shared position bytes"), + ); + let original_offsets = stream.block_offsets().as_ptr(); + + let cloned = stream.clone(); + + assert_eq!(cloned.block_offsets(), stream.block_offsets()); + assert_eq!(cloned.block_offsets().as_ptr(), original_offsets); + } + #[test] fn test_posting_builder_roundtrip_shared_positions() { let entries = vec![ From 3c2e36e1b22cf8059059190140a6e2adc7b979ce Mon Sep 17 00:00:00 2001 From: Dan Rammer Date: Tue, 16 Jun 2026 03:01:18 -0500 Subject: [PATCH 105/177] fix(mem_wal): push LIMIT/OFFSET down to fresh-tier scan sources (#7256) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary The LSM scan planner (`LsmScanPlanner`) applied a query's `LIMIT`/`OFFSET` *above* the per-source union, so every flushed generation and the base table were fully scanned before the limit was imposed at the top. For a bounded read over a fresh tier with many flushed generations this materialized the entire tier on every query. 
This threads an `overfetch_factor` through `LsmScanner` → `LsmScanPlanner` and pushes a per-source fetch of `(offset + limit) * overfetch` into each `Base`/`Flushed` source scan. The active memtable is exempt (its within-source dedup needs the full match set). Block-listed sources over-fetch by `overfetch_factor` so cross-generation PK dedup still leaves ~`limit` live rows; a `LocalLimit` re-imposes the exact `skip`/`fetch` cap above the merge. Unbounded reads (no limit) are unchanged. ## Changes - `scanner/planner.rs`: `overfetch_factor` field + `with_overfetch_factor` on `LsmScanPlanner`; per-source limit pushdown in `plan_scan`; `fetch` parameter on `build_source_scan` (pushed into Base/Flushed scans, ActiveMemTable exempt). - `scanner/builder.rs`: `overfetch_factor` on `LsmScanner` + `with_overfetch_factor`, threaded into both the scan and FTS planners. ## Validation Validated end-to-end against a WAL benchmark on minikube with object storage behind a latency proxy: a bounded `offset+limit` read over an 18-generation fresh tier dropped from a full-tier scan (~360 generation scans) to a bounded per-source read. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-authored-by: Claude Fable 5 --- .../src/dataset/mem_wal/scanner/builder.rs | 19 +++++ .../src/dataset/mem_wal/scanner/planner.rs | 70 +++++++++++++++++-- 2 files changed, 84 insertions(+), 5 deletions(-) diff --git a/rust/lance/src/dataset/mem_wal/scanner/builder.rs b/rust/lance/src/dataset/mem_wal/scanner/builder.rs index ade4164d485..508605c4642 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/builder.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/builder.rs @@ -125,6 +125,9 @@ pub struct LsmScanner { /// Cache of opened flushed-generation datasets. When set, repeated /// queries against the same generation skip the manifest read entirely. 
flushed_cache: Option>, + /// Over-fetch multiple for block-listed sources in search plans + /// (see [`super::LsmFtsSearchPlanner::with_overfetch_factor`]). + overfetch_factor: Option, } impl LsmScanner { @@ -160,6 +163,7 @@ impl LsmScanner { pk_columns, session, flushed_cache: None, + overfetch_factor: None, } } @@ -198,6 +202,7 @@ impl LsmScanner { pk_columns, session: None, flushed_cache: None, + overfetch_factor: None, } } @@ -253,6 +258,14 @@ impl LsmScanner { self } + /// Set the over-fetch multiple block-listed sources use in search plans + /// so they still yield `k` live rows after cross-generation dedup. + /// Threaded into [`super::LsmFtsSearchPlanner`]; clamped to `>= 1.0`. + pub fn with_overfetch_factor(mut self, factor: f64) -> Self { + self.overfetch_factor = Some(factor); + self + } + /// Project specific columns. /// /// If not called, all columns from the base schema are included. @@ -370,6 +383,9 @@ impl LsmScanner { if let Some(cache) = &self.flushed_cache { planner = planner.with_flushed_cache(cache.clone()); } + if let Some(factor) = self.overfetch_factor { + planner = planner.with_overfetch_factor(factor); + } planner .plan_scan( @@ -405,6 +421,9 @@ impl LsmScanner { if let Some(cache) = &self.flushed_cache { planner = planner.with_flushed_cache(cache.clone()); } + if let Some(factor) = self.overfetch_factor { + planner = planner.with_overfetch_factor(factor); + } planner .plan_search(column, query, k, self.projection.as_deref()) .await diff --git a/rust/lance/src/dataset/mem_wal/scanner/planner.rs b/rust/lance/src/dataset/mem_wal/scanner/planner.rs index f3f15e2e680..eca0255be1c 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/planner.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/planner.rs @@ -34,6 +34,10 @@ pub struct LsmScanPlanner { session: Option>, /// Cache of opened flushed-generation datasets. 
flushed_cache: Option>, + /// Over-fetch multiple for the per-source limit pushdown: block-listed + /// sources scan `(offset + limit) * factor` rows so cross-gen dedup drops + /// still leave enough live rows. Clamped to `>= 1.0`. + overfetch_factor: f64, } impl LsmScanPlanner { @@ -49,6 +53,7 @@ impl LsmScanPlanner { base_schema, session: None, flushed_cache: None, + overfetch_factor: 1.0, } } @@ -66,6 +71,13 @@ impl LsmScanPlanner { self } + /// Set the over-fetch multiple for the per-source limit pushdown + /// (see the field docs). Clamped to `>= 1.0` at use. + pub fn with_overfetch_factor(mut self, factor: f64) -> Self { + self.overfetch_factor = factor; + self + } + /// Create scan plan with deduplication. /// /// # Arguments @@ -130,23 +142,59 @@ impl LsmScanPlanner { // cross-gen block-list, not from output ordering. let sources: Vec<_> = sources.into_iter().rev().collect(); + // Per-source limit pushdown: an unordered LIMIT needs only + // `offset + limit` live rows from EACH source to fill the global + // limit after dedup (any-N semantics), so cap every on-disk source + // instead of scanning whole generations and trimming above the + // union. Block-listed sources over-fetch by `overfetch_factor` so + // cross-gen dedup drops still leave `n_needed` live rows; the + // PkHashFilter warns when that was not enough. The active memtable + // is in-memory and within-gen append duplicates are resolved by its + // own dedup, so it is never capped here. + let n_needed = limit.map(|l| l.saturating_add(offset.unwrap_or(0))); + let overfetch = self.overfetch_factor.max(1.0); + let mut source_plans = Vec::new(); for source in sources { let is_base = matches!(source, LsmDataSource::BaseTable { .. }); - let scan = self.build_source_scan(&source, projection, filter).await?; + let is_active = matches!(source, LsmDataSource::ActiveMemTable { .. 
}); + let blocked = block_lists + .get(&(source.shard_id(), source.generation())) + .cloned(); + let fetch = match (n_needed, is_active) { + (Some(n), false) => Some(if blocked.is_some() { + ((n as f64) * overfetch).ceil() as usize + } else { + n + }), + _ => None, + }; + let scan = self + .build_source_scan(&source, projection, filter, fetch) + .await?; // Drop cross-generation stale rows (PKs superseded by a newer gen). - // `k = 0`: there is no top-k, so the under-fetch warning never fires. - let scan = match block_lists.get(&(source.shard_id(), source.generation())) { + // With a limit, `k = n_needed` arms the under-fetch warning; with + // no limit `k = 0` keeps it silent. + let scan = match blocked { Some(set) => Arc::new(PkHashFilterExec::new( scan, self.pk_columns.clone(), - set.clone(), - 0, + set, + n_needed.unwrap_or(0), )) as Arc, None => scan, }; + // Post-block-list cap: each source contributes at most `n_needed` + // live rows toward the global limit. + let scan: Arc = match n_needed { + Some(n) if !is_active => Arc::new( + datafusion::physical_plan::limit::LocalLimitExec::new(scan, n), + ), + _ => scan, + }; + // When `_rowaddr` is surfaced, NULL it for non-base arms: only base // values are meaningful (e.g. for `take_rows`); per-source addresses // collide with base IDs. @@ -229,6 +277,7 @@ impl LsmScanPlanner { source: &LsmDataSource, projection: Option<&[String]>, filter: Option<&Expr>, + fetch: Option, ) -> Result> { match source { LsmDataSource::BaseTable { dataset } => { @@ -247,6 +296,11 @@ impl LsmScanPlanner { if let Some(expr) = filter { scanner.filter_expr(expr.clone()); } + // Per-source limit pushdown (post-filter rows): bounds the + // physical scan instead of trimming above the union. 
+ if let Some(fetch) = fetch { + scanner.limit(Some(fetch as i64), None)?; + } scanner.create_plan().await } @@ -264,6 +318,12 @@ impl LsmScanPlanner { if let Some(expr) = filter { scanner.filter_expr(expr.clone()); } + // Per-source limit pushdown: flushed generations are + // within-gen live (dedup-on-flush deletion vectors), so any + // `fetch` post-filter rows are valid contributions. + if let Some(fetch) = fetch { + scanner.limit(Some(fetch as i64), None)?; + } scanner.create_plan().await } From a558da54c4a4ce6b9ffe68b7cc4f68472775b702 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Tue, 16 Jun 2026 16:31:15 +0800 Subject: [PATCH 106/177] fix: reject nested legacy blobs in v2.2 (#7278) This closes a v2.2 write-path hole where legacy blob metadata was rejected only on top-level fields. A nested field such as `summary_image_nested.image_bytes` could still carry `lance-encoding:blob` into a v2.2 dataset manifest. The write guard now scans the full Lance schema preorder and reports the offending field path. This keeps v2.2 dataset creation from silently preserving BlobV1 columns while leaving valid `lance.blob.v2` inputs untouched. --- rust/lance/src/dataset/write.rs | 18 +++++++++---- rust/lance/src/dataset/write/insert.rs | 37 +++++++++++++++++++++++++- 2 files changed, 49 insertions(+), 6 deletions(-) diff --git a/rust/lance/src/dataset/write.rs b/rust/lance/src/dataset/write.rs index 37a3d584271..d6ac3ce9d45 100644 --- a/rust/lance/src/dataset/write.rs +++ b/rust/lance/src/dataset/write.rs @@ -1069,13 +1069,10 @@ pub async fn write_fragments_internal( } if storage_version >= LanceFileVersion::V2_2 - && schema - .fields - .iter() - .any(|f| f.metadata.contains_key(BLOB_META_KEY)) + && let Some(blob_field_path) = legacy_blob_field_path(&schema) { return Err(Error::invalid_input(format!( - "Legacy blob columns (field metadata key {BLOB_META_KEY:?}) are not supported for file version >= 2.2. 
Use the blob v2 extension type (ARROW:extension:name = \"lance.blob.v2\") and the new blob APIs (e.g. lance::blob::blob_field / lance::blob::BlobArrayBuilder)." + "Legacy blob columns (field metadata key {BLOB_META_KEY:?}) are not supported for file version >= 2.2. Found legacy blob field: {blob_field_path}. Use the blob v2 extension type (ARROW:extension:name = \"lance.blob.v2\") and the new blob APIs (e.g. lance::blob::blob_field / lance::blob::BlobArrayBuilder)." ))); } @@ -1094,6 +1091,17 @@ pub async fn write_fragments_internal( Ok((fragments, schema)) } +fn legacy_blob_field_path(schema: &Schema) -> Option { + schema + .fields_pre_order() + .find(|field| field.metadata.contains_key(BLOB_META_KEY)) + .map(|field| { + schema + .field_path(field.id) + .unwrap_or_else(|_| field.name.clone()) + }) +} + #[async_trait::async_trait] pub trait GenericWriter: Send { /// Write the given batches to the file diff --git a/rust/lance/src/dataset/write/insert.rs b/rust/lance/src/dataset/write/insert.rs index 20209ed7f30..bfd702c9c3b 100644 --- a/rust/lance/src/dataset/write/insert.rs +++ b/rust/lance/src/dataset/write/insert.rs @@ -442,7 +442,7 @@ struct WriteContext<'a> { mod test { use std::collections::HashMap; - use arrow_array::{BinaryArray, Int32Array, RecordBatchReader, StructArray}; + use arrow_array::{ArrayRef, BinaryArray, Int32Array, RecordBatchReader, StructArray}; use arrow_schema::{ArrowError, DataType, Field, Schema}; use lance_arrow::BLOB_META_KEY; @@ -559,6 +559,41 @@ mod test { } } + #[tokio::test] + async fn create_v2_2_dataset_rejects_nested_legacy_blob_schema() { + let image_field = Field::new("image_bytes", DataType::Binary, true).with_metadata( + HashMap::from([(BLOB_META_KEY.to_string(), "true".to_string())]), + ); + let schema = Arc::new(Schema::new(vec![Field::new( + "summary_image_nested", + DataType::Struct(vec![image_field.clone()].into()), + true, + )])); + let image_values: ArrayRef = Arc::new(BinaryArray::from(vec![Some(b"abc".as_slice())])); 
+ let nested_values = StructArray::from(vec![(Arc::new(image_field), image_values)]); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(nested_values)]).unwrap(); + + let dataset = InsertBuilder::new("memory://forced-nested-blob-v2") + .with_params(&WriteParams { + mode: WriteMode::Create, + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }) + .execute_stream(RecordBatchIterator::new(vec![Ok(batch)], schema.clone())) + .await; + + let err = dataset.unwrap_err(); + match err { + Error::InvalidInput { source, .. } => { + let message = source.to_string(); + assert!(message.contains("Legacy blob columns")); + assert!(message.contains("summary_image_nested.image_bytes")); + assert!(message.contains("lance.blob.v2")); + } + other => panic!("unexpected error: {other:?}"), + } + } + mod external_error { use super::*; use std::fmt; From 440acf0685dc2f92a8298ae591d1a4b7c1312e30 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Tue, 16 Jun 2026 17:50:49 +0800 Subject: [PATCH 107/177] fix: support blob v2 in nested structs (#7281) Blob v2 write support only handled top-level fields, so blob v2 columns nested under structs could reach the writer with the logical layout while the file footer expected descriptor fields. This PR makes blob v2 preprocessing and footer descriptor unloading walk nested structs, while preserving the existing per-field blob threshold handling. It also applies the same storage-version and external-reference checks to nested blob v2 fields that top-level blob v2 fields already use. Validated locally with the required Rust and Python checks plus targeted nested blob v2 read/write coverage. 
--- python/python/tests/test_blob.py | 32 ++ rust/lance-core/src/datatypes/field.rs | 60 ++ rust/lance-file/src/writer.rs | 13 +- rust/lance/src/dataset/blob.rs | 746 +++++++++++++++++-------- rust/lance/src/dataset/write.rs | 7 +- 5 files changed, 602 insertions(+), 256 deletions(-) diff --git a/python/python/tests/test_blob.py b/python/python/tests/test_blob.py index cb34ae3cafc..fc879c9cbaa 100644 --- a/python/python/tests/test_blob.py +++ b/python/python/tests/test_blob.py @@ -1460,6 +1460,38 @@ def test_read_blobs_resolves_nested_field_path(dataset_with_nested_blobs): assert [data for _, data in results] == [b"foo", b"baz"] +def test_write_nested_blob_v2_and_take_by_field_path(tmp_path): + packed = b"x" * (70 * 1024) + blob_field = lance.blob_field("blob") + info_fields = [pa.field("name", pa.string()), blob_field] + info_type = pa.struct(info_fields) + info_array = pa.StructArray.from_arrays( + [pa.array(["a", "b", "c"]), lance.blob_array([b"foo", packed, None])], + fields=info_fields, + ) + table = pa.table( + [info_array], + schema=pa.schema([pa.field("info", info_type)]), + ) + + dataset = lance.write_dataset( + table, + tmp_path / "nested_blob_v2", + data_storage_version="2.2", + ) + + desc = dataset.to_table(columns=["info.blob"]).column("info.blob").chunk(0) + assert desc.field("kind").to_pylist()[:2] == [0, 1] + + blobs = dataset.take_blobs("info.blob", indices=[0, 1]) + with blobs[0] as f: + assert f.read() == b"foo" + with blobs[1] as f: + assert f.read() == packed + + assert dataset.take_blobs("info.blob", indices=[2]) == [] + + def test_to_pandas_returns_blob_files_for_projected_nested_fields( dataset_with_nested_blobs, ): diff --git a/rust/lance-core/src/datatypes/field.rs b/rust/lance-core/src/datatypes/field.rs index 4c2665a3640..9f06d421949 100644 --- a/rust/lance-core/src/datatypes/field.rs +++ b/rust/lance-core/src/datatypes/field.rs @@ -575,6 +575,18 @@ impl Field { } } + /// Convert blob v2 fields in this field tree to their descriptor 
view. + pub fn unload_blobs_recursive(&mut self) { + if self.is_blob_v2() { + self.unloaded_mut(); + return; + } + + for child in &mut self.children { + child.unload_blobs_recursive(); + } + } + pub fn project(&self, path_components: &[&str]) -> Result { let mut f = Self { name: self.name.clone(), @@ -1864,6 +1876,54 @@ mod tests { assert_eq!(field.logical_type, BLOB_V2_DESC_LANCE_FIELD.logical_type); } + #[test] + fn unload_blobs_recursive_only_unloads_blob_v2() { + let legacy_metadata = HashMap::from([(BLOB_META_KEY.to_string(), "true".to_string())]); + let blob_v2_metadata = + HashMap::from([(ARROW_EXT_NAME_KEY.to_string(), BLOB_V2_EXT_NAME.to_string())]); + + let mut field: Field = ArrowField::new( + "parent", + DataType::Struct(Fields::from(vec![ + ArrowField::new("legacy_blob", DataType::LargeBinary, true) + .with_metadata(legacy_metadata), + ArrowField::new( + "blob_v2", + DataType::Struct( + vec![ + ArrowField::new("data", DataType::LargeBinary, true), + ArrowField::new("uri", DataType::Utf8, true), + ] + .into(), + ), + true, + ) + .with_metadata(blob_v2_metadata), + ])), + true, + ) + .try_into() + .unwrap(); + + field.unload_blobs_recursive(); + + let legacy_blob = field + .children + .iter() + .find(|f| f.name == "legacy_blob") + .unwrap(); + assert_eq!( + legacy_blob.logical_type, + LogicalType::try_from(&DataType::LargeBinary).unwrap() + ); + assert_eq!(legacy_blob.children.len(), 0); + assert!(legacy_blob.metadata.contains_key(BLOB_META_KEY)); + + let blob_v2 = field.children.iter().find(|f| f.name == "blob_v2").unwrap(); + assert_eq!(blob_v2.logical_type, BLOB_V2_DESC_LANCE_FIELD.logical_type); + assert_eq!(blob_v2.children.len(), 5); + } + #[test] fn project_by_field_accepts_blob_descriptor_projection() { let metadata = HashMap::from([(BLOB_META_KEY.to_string(), "true".to_string())]); diff --git a/rust/lance-file/src/writer.rs b/rust/lance-file/src/writer.rs index 14a4c82bde6..12bd50df6fe 100644 --- a/rust/lance-file/src/writer.rs +++ 
b/rust/lance-file/src/writer.rs @@ -633,14 +633,11 @@ impl FileWriter { async fn write_global_buffers(&mut self) -> Result> { let schema = self.schema.as_mut().ok_or(Error::invalid_input("No schema provided on writer open and no data provided. Schema is unknown and file cannot be created"))?; schema.metadata = std::mem::take(&mut self.schema_metadata); - // Use descriptor layout for blob v2 in the footer to avoid exposing logical child fields. - // - // TODO(xuanwo): this doesn't work on nested struct, need better solution like fields_per_order_mut? - schema.fields.iter_mut().for_each(|f| { - if f.is_blob_v2() { - f.unloaded_mut(); - } - }); + // Use descriptor layout for blob v2 fields in the footer to avoid exposing logical child fields. + schema + .fields + .iter_mut() + .for_each(|f| f.unload_blobs_recursive()); let file_descriptor = Self::make_file_descriptor(schema, self.rows_written)?; let file_descriptor_bytes = file_descriptor.encode_to_vec(); diff --git a/rust/lance/src/dataset/blob.rs b/rust/lance/src/dataset/blob.rs index 56bcc97cb32..8cdde543e4e 100644 --- a/rust/lance/src/dataset/blob.rs +++ b/rust/lance/src/dataset/blob.rs @@ -12,15 +12,17 @@ use std::{ use arrow::array::AsArray; use arrow::datatypes::{UInt8Type, UInt32Type, UInt64Type}; -use arrow_array::Array; use arrow_array::RecordBatch; use arrow_array::builder::{LargeBinaryBuilder, PrimitiveBuilder, StringBuilder}; -use arrow_schema::DataType as ArrowDataType; +use arrow_array::{Array, ArrayRef}; +use arrow_schema::{DataType as ArrowDataType, Field as ArrowField}; use bytes::Bytes; +use futures::future::BoxFuture; use futures::stream::BoxStream; use futures::{FutureExt, StreamExt, TryStreamExt, stream}; use lance_arrow::{ BLOB_DEDICATED_SIZE_THRESHOLD_META_KEY, BLOB_INLINE_SIZE_THRESHOLD_META_KEY, FieldExt, + r#struct::StructArrayExt, }; use lance_io::object_store::{ObjectStore, ObjectStoreParams, ObjectStoreRegistry}; use lance_io::scheduler::{FileScheduler, ScanScheduler, SchedulerConfig}; @@ 
-259,10 +261,7 @@ pub struct BlobPreprocessor { data_file_key: String, local_counter: u32, pack_writer: PackWriter, - blob_v2_cols: Vec, - inline_thresholds: Vec, - dedicated_thresholds: Vec, - writer_metadata: Vec>, + field_processors: Vec, external_base_resolver: Option>, allow_external_blob_outside_bases: bool, external_blob_mode: ExternalBlobMode, @@ -287,6 +286,64 @@ enum BlobWriteSource<'a> { External(&'a ExternalBlobSource), } +#[derive(Clone, Debug)] +struct BlobPreprocessField { + kind: BlobPreprocessFieldKind, +} + +#[derive(Clone, Debug)] +enum BlobPreprocessFieldKind { + BlobV2 { + inline_threshold: usize, + dedicated_threshold: usize, + writer_metadata: HashMap, + }, + Struct { + children: Vec, + }, + Passthrough, +} + +impl BlobPreprocessField { + fn new(field: &ArrowField) -> Result { + if field.is_blob_v2() { + return Ok(Self { + kind: BlobPreprocessFieldKind::BlobV2 { + inline_threshold: blob_inline_threshold_from_metadata( + field.metadata(), + field.name(), + )?, + dedicated_threshold: blob_dedicated_threshold_from_metadata( + field.metadata(), + field.name(), + )?, + writer_metadata: field.metadata().clone(), + }, + }); + } + + if let ArrowDataType::Struct(children) = field.data_type() { + let children = children + .iter() + .map(|child| Self::new(child.as_ref())) + .collect::>>()?; + if children.iter().any(|child| child.requires_preprocessing()) { + return Ok(Self { + kind: BlobPreprocessFieldKind::Struct { children }, + }); + } + } + + Ok(Self { + kind: BlobPreprocessFieldKind::Passthrough, + }) + } + + fn requires_preprocessing(&self) -> bool { + !matches!(self.kind, BlobPreprocessFieldKind::Passthrough) + } +} + impl ExternalBlobSource { /// Return the logical payload size after applying any external slice. 
fn size(&self) -> u64 { @@ -378,37 +435,11 @@ impl BlobPreprocessor { pack_writer.max_pack_size = max_bytes; } let arrow_schema = arrow_schema::Schema::from(schema); - let fields = arrow_schema.fields(); - let blob_v2_cols = fields + let field_processors = arrow_schema + .fields() .iter() - .map(|field| field.is_blob_v2()) - .collect::>(); - let inline_thresholds = fields - .iter() - .zip(blob_v2_cols.iter()) - .map(|(field, is_blob_v2)| { - if *is_blob_v2 { - blob_inline_threshold_from_metadata(field.metadata(), field.name()) - } else { - Ok(INLINE_MAX) - } - }) + .map(|field| BlobPreprocessField::new(field.as_ref())) .collect::>>()?; - let dedicated_thresholds = fields - .iter() - .zip(blob_v2_cols.iter()) - .map(|(field, is_blob_v2)| { - if *is_blob_v2 { - blob_dedicated_threshold_from_metadata(field.metadata(), field.name()) - } else { - Ok(DEDICATED_THRESHOLD) - } - }) - .collect::>>()?; - let writer_metadata = fields - .iter() - .map(|field| field.metadata().clone()) - .collect(); Ok(Self { object_store, data_dir, @@ -416,10 +447,7 @@ impl BlobPreprocessor { // Start at 1 to avoid a potential all-zero blob_id value. 
local_counter: 1, pack_writer, - blob_v2_cols, - inline_thresholds, - dedicated_thresholds, - writer_metadata, + field_processors, external_base_resolver, allow_external_blob_outside_bases, external_blob_mode, @@ -520,7 +548,7 @@ impl BlobPreprocessor { } pub(crate) async fn preprocess_batch(&mut self, batch: &RecordBatch) -> Result { - let expected_columns = self.blob_v2_cols.len(); + let expected_columns = self.field_processors.len(); if batch.num_columns() != expected_columns { return Err(Error::invalid_input(format!( "Unexpected number of columns: expected {}, got {}", @@ -531,246 +559,340 @@ impl BlobPreprocessor { let batch_schema = batch.schema(); let batch_fields = batch_schema.fields(); + let field_processors = self.field_processors.clone(); let mut new_columns = Vec::with_capacity(batch.num_columns()); let mut new_fields = Vec::with_capacity(batch.num_columns()); - for idx in 0..batch.num_columns() { - let array = batch.column(idx); - let field = &batch_fields[idx]; - if !self.blob_v2_cols[idx] { - new_columns.push(array.clone()); - new_fields.push(field.clone()); + for ((processor, array), field) in field_processors + .iter() + .zip(batch.columns().iter()) + .zip(batch_fields.iter()) + { + let (new_column, new_field) = self + .preprocess_field(processor, array.clone(), field) + .await?; + new_columns.push(new_column); + new_fields.push(new_field); + } + + let new_schema = Arc::new(arrow_schema::Schema::new_with_metadata( + new_fields + .iter() + .map(|f| f.as_ref().clone()) + .collect::>(), + batch_schema.metadata().clone(), + )); + + RecordBatch::try_new(new_schema, new_columns) + .map_err(|e| Error::invalid_input(e.to_string())) + } + + fn preprocess_field<'a>( + &'a mut self, + processor: &'a BlobPreprocessField, + array: ArrayRef, + field: &'a Arc, + ) -> BoxFuture<'a, Result<(ArrayRef, Arc)>> { + async move { + match &processor.kind { + BlobPreprocessFieldKind::Passthrough => Ok((array, field.clone())), + BlobPreprocessFieldKind::BlobV2 { + 
inline_threshold, + dedicated_threshold, + writer_metadata, + } => { + self.preprocess_blob_array( + array, + field.as_ref(), + *inline_threshold, + *dedicated_threshold, + writer_metadata, + ) + .await + } + BlobPreprocessFieldKind::Struct { children } => { + self.preprocess_struct_array(array, field.as_ref(), children) + .await + } + } + } + .boxed() + } + + async fn preprocess_struct_array( + &mut self, + array: ArrayRef, + field: &ArrowField, + children: &[BlobPreprocessField], + ) -> Result<(ArrayRef, Arc)> { + let struct_arr = array + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::invalid_input("Struct field was not a struct array"))?; + if struct_arr.num_columns() != children.len() { + return Err(Error::invalid_input(format!( + "Struct field '{}' expected {} children, got {}", + field.name(), + children.len(), + struct_arr.num_columns() + ))); + } + + let struct_arr = struct_arr.normalize_slicing()?; + let parent_nulls = struct_arr.nulls().cloned(); + let pushed_down = struct_arr.pushdown_nulls()?; + let child_fields = pushed_down.fields().clone(); + let child_columns = pushed_down.columns().to_vec(); + + let mut new_columns = Vec::with_capacity(children.len()); + let mut new_fields = Vec::with_capacity(children.len()); + for ((child_processor, child_array), child_field) in children + .iter() + .zip(child_columns.into_iter()) + .zip(child_fields.iter()) + { + let (new_column, new_field) = self + .preprocess_field(child_processor, child_array, child_field) + .await?; + new_columns.push(new_column); + new_fields.push(new_field); + } + + let struct_array = + StructArray::try_new(new_fields.clone().into(), new_columns, parent_nulls)?; + let field = Arc::new( + ArrowField::new( + field.name(), + ArrowDataType::Struct(new_fields.into()), + field.is_nullable(), + ) + .with_metadata(field.metadata().clone()), + ); + Ok((Arc::new(struct_array), field)) + } + + async fn preprocess_blob_array( + &mut self, + array: ArrayRef, + field: &ArrowField, + 
inline_threshold: usize, + dedicated_threshold: usize, + writer_metadata: &HashMap, + ) -> Result<(ArrayRef, Arc)> { + let struct_arr = array + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::invalid_input("Blob column was not a struct array"))?; + + let data_col = struct_arr + .column_by_name("data") + .ok_or_else(|| Error::invalid_input("Blob struct missing `data` field"))? + .as_binary::(); + let uri_col = struct_arr + .column_by_name("uri") + .ok_or_else(|| Error::invalid_input("Blob struct missing `uri` field"))? + .as_string::(); + let position_col = struct_arr + .column_by_name("position") + .map(|col| col.as_primitive::()); + let size_col = struct_arr + .column_by_name("size") + .map(|col| col.as_primitive::()); + + let mut data_builder = LargeBinaryBuilder::with_capacity(struct_arr.len(), 0); + let mut uri_builder = StringBuilder::with_capacity(struct_arr.len(), 0); + let mut blob_id_builder = + PrimitiveBuilder::::with_capacity(struct_arr.len()); + let mut blob_size_builder = + PrimitiveBuilder::::with_capacity(struct_arr.len()); + let mut kind_builder = PrimitiveBuilder::::with_capacity(struct_arr.len()); + let mut position_builder = + PrimitiveBuilder::::with_capacity(struct_arr.len()); + + let struct_nulls = struct_arr.nulls(); + + for i in 0..struct_arr.len() { + if struct_arr.is_null(i) { + data_builder.append_null(); + uri_builder.append_null(); + blob_id_builder.append_null(); + blob_size_builder.append_null(); + kind_builder.append_null(); + position_builder.append_null(); continue; } - let struct_arr = array - .as_any() - .downcast_ref::() - .ok_or_else(|| Error::invalid_input("Blob column was not a struct array"))?; - - let data_col = struct_arr - .column_by_name("data") - .ok_or_else(|| Error::invalid_input("Blob struct missing `data` field"))? - .as_binary::(); - let uri_col = struct_arr - .column_by_name("uri") - .ok_or_else(|| Error::invalid_input("Blob struct missing `uri` field"))? 
- .as_string::(); - let position_col = struct_arr - .column_by_name("position") - .map(|col| col.as_primitive::()); - let size_col = struct_arr - .column_by_name("size") - .map(|col| col.as_primitive::()); - - let mut data_builder = LargeBinaryBuilder::with_capacity(struct_arr.len(), 0); - let mut uri_builder = StringBuilder::with_capacity(struct_arr.len(), 0); - let mut blob_id_builder = - PrimitiveBuilder::::with_capacity(struct_arr.len()); - let mut blob_size_builder = - PrimitiveBuilder::::with_capacity(struct_arr.len()); - let mut kind_builder = PrimitiveBuilder::::with_capacity(struct_arr.len()); - let mut position_builder = - PrimitiveBuilder::::with_capacity(struct_arr.len()); - - let struct_nulls = struct_arr.nulls(); - - for i in 0..struct_arr.len() { - if struct_arr.is_null(i) { - data_builder.append_null(); - uri_builder.append_null(); - blob_id_builder.append_null(); - blob_size_builder.append_null(); - kind_builder.append_null(); - position_builder.append_null(); - continue; - } + let has_data = !data_col.is_null(i); + let has_uri = !uri_col.is_null(i); + let has_position = position_col + .as_ref() + .map(|col| !col.is_null(i)) + .unwrap_or(false); + let has_size = size_col + .as_ref() + .map(|col| !col.is_null(i)) + .unwrap_or(false); + let data_len = if has_data { data_col.value(i).len() } else { 0 }; - let has_data = !data_col.is_null(i); - let has_uri = !uri_col.is_null(i); - let has_position = position_col - .as_ref() - .map(|col| !col.is_null(i)) - .unwrap_or(false); - let has_size = size_col - .as_ref() - .map(|col| !col.is_null(i)) - .unwrap_or(false); - let data_len = if has_data { data_col.value(i).len() } else { 0 }; - - let dedicated_threshold = self.dedicated_thresholds[idx]; - let inline_threshold = self.inline_thresholds[idx]; - if has_data && data_len > dedicated_threshold { - let blob_id = self.next_blob_id(); - self.write_dedicated(blob_id, BlobWriteSource::Bytes(data_col.value(i))) - .await?; - - 
kind_builder.append_value(BlobKind::Dedicated as u8); - data_builder.append_null(); - uri_builder.append_null(); - blob_id_builder.append_value(blob_id); - blob_size_builder.append_value(data_len as u64); - position_builder.append_null(); - continue; - } + if has_data && data_len > dedicated_threshold { + let blob_id = self.next_blob_id(); + self.write_dedicated(blob_id, BlobWriteSource::Bytes(data_col.value(i))) + .await?; - if has_data && data_len > inline_threshold { - let (pack_blob_id, position) = self - .write_packed(BlobWriteSource::Bytes(data_col.value(i))) - .await?; + kind_builder.append_value(BlobKind::Dedicated as u8); + data_builder.append_null(); + uri_builder.append_null(); + blob_id_builder.append_value(blob_id); + blob_size_builder.append_value(data_len as u64); + position_builder.append_null(); + continue; + } - kind_builder.append_value(BlobKind::Packed as u8); - data_builder.append_null(); - uri_builder.append_null(); - blob_id_builder.append_value(pack_blob_id); - blob_size_builder.append_value(data_len as u64); - position_builder.append_value(position); - continue; - } + if has_data && data_len > inline_threshold { + let (pack_blob_id, position) = self + .write_packed(BlobWriteSource::Bytes(data_col.value(i))) + .await?; - if has_uri { - let uri_val = uri_col.value(i); - if self.external_blob_mode == ExternalBlobMode::Ingest { - let position = if has_position { - Some( - position_col - .as_ref() - .expect("position column must exist") - .value(i), - ) - } else { - None - }; - let size = if has_size { - Some(size_col.as_ref().expect("size column must exist").value(i)) - } else { - None - }; - let source = self.open_external_source(uri_val, position, size).await?; - let data_len = source.size(); - - if data_len > dedicated_threshold as u64 { - let blob_id = self.next_blob_id(); - self.write_dedicated(blob_id, BlobWriteSource::External(&source)) - .await?; - - kind_builder.append_value(BlobKind::Dedicated as u8); - data_builder.append_null(); - 
uri_builder.append_null(); - blob_id_builder.append_value(blob_id); - blob_size_builder.append_value(data_len); - position_builder.append_null(); - continue; - } + kind_builder.append_value(BlobKind::Packed as u8); + data_builder.append_null(); + uri_builder.append_null(); + blob_id_builder.append_value(pack_blob_id); + blob_size_builder.append_value(data_len as u64); + position_builder.append_value(position); + continue; + } - if data_len > inline_threshold as u64 { - let (pack_blob_id, position) = self - .write_packed(BlobWriteSource::External(&source)) - .await?; - - kind_builder.append_value(BlobKind::Packed as u8); - data_builder.append_null(); - uri_builder.append_null(); - blob_id_builder.append_value(pack_blob_id); - blob_size_builder.append_value(data_len); - position_builder.append_value(position); - continue; - } + if has_uri { + let uri_val = uri_col.value(i); + if self.external_blob_mode == ExternalBlobMode::Ingest { + let position = if has_position { + Some( + position_col + .as_ref() + .expect("position column must exist") + .value(i), + ) + } else { + None + }; + let size = if has_size { + Some(size_col.as_ref().expect("size column must exist").value(i)) + } else { + None + }; + let source = self.open_external_source(uri_val, position, size).await?; + let data_len = source.size(); - let data = source.read_all().await?; + if data_len > dedicated_threshold as u64 { + let blob_id = self.next_blob_id(); + self.write_dedicated(blob_id, BlobWriteSource::External(&source)) + .await?; - kind_builder.append_value(BlobKind::Inline as u8); - data_builder.append_value(data.as_ref()); + kind_builder.append_value(BlobKind::Dedicated as u8); + data_builder.append_null(); uri_builder.append_null(); - blob_id_builder.append_null(); - blob_size_builder.append_null(); + blob_id_builder.append_value(blob_id); + blob_size_builder.append_value(data_len); position_builder.append_null(); continue; } - let (external_base_id, external_uri_or_path) = - 
self.resolve_external_reference(uri_val).await?; - kind_builder.append_value(BlobKind::External as u8); - data_builder.append_null(); - uri_builder.append_value(external_uri_or_path); - blob_id_builder.append_value(external_base_id); - if has_position && has_size { - let position = position_col - .as_ref() - .expect("position column must exist") - .value(i); - let size = size_col.as_ref().expect("size column must exist").value(i); - blob_size_builder.append_value(size); + if data_len > inline_threshold as u64 { + let (pack_blob_id, position) = self + .write_packed(BlobWriteSource::External(&source)) + .await?; + + kind_builder.append_value(BlobKind::Packed as u8); + data_builder.append_null(); + uri_builder.append_null(); + blob_id_builder.append_value(pack_blob_id); + blob_size_builder.append_value(data_len); position_builder.append_value(position); - } else { - blob_size_builder.append_null(); - position_builder.append_null(); + continue; } - continue; - } - if has_data { + let data = source.read_all().await?; + kind_builder.append_value(BlobKind::Inline as u8); - let value = data_col.value(i); - data_builder.append_value(value); + data_builder.append_value(data.as_ref()); uri_builder.append_null(); blob_id_builder.append_null(); blob_size_builder.append_null(); position_builder.append_null(); + continue; + } + + let (external_base_id, external_uri_or_path) = + self.resolve_external_reference(uri_val).await?; + kind_builder.append_value(BlobKind::External as u8); + data_builder.append_null(); + uri_builder.append_value(external_uri_or_path); + blob_id_builder.append_value(external_base_id); + if has_position && has_size { + let position = position_col + .as_ref() + .expect("position column must exist") + .value(i); + let size = size_col.as_ref().expect("size column must exist").value(i); + blob_size_builder.append_value(size); + position_builder.append_value(position); } else { - data_builder.append_null(); - uri_builder.append_null(); - 
blob_id_builder.append_null(); blob_size_builder.append_null(); - kind_builder.append_null(); position_builder.append_null(); } + continue; } - let child_fields = vec![ - arrow_schema::Field::new("kind", ArrowDataType::UInt8, true), - arrow_schema::Field::new("data", ArrowDataType::LargeBinary, true), - arrow_schema::Field::new("uri", ArrowDataType::Utf8, true), - arrow_schema::Field::new("blob_id", ArrowDataType::UInt32, true), - arrow_schema::Field::new("blob_size", ArrowDataType::UInt64, true), - arrow_schema::Field::new("position", ArrowDataType::UInt64, true), - ]; - - let struct_array = arrow_array::StructArray::try_new( - child_fields.clone().into(), - vec![ - Arc::new(kind_builder.finish()), - Arc::new(data_builder.finish()), - Arc::new(uri_builder.finish()), - Arc::new(blob_id_builder.finish()), - Arc::new(blob_size_builder.finish()), - Arc::new(position_builder.finish()), - ], - struct_nulls.cloned(), - )?; - - new_columns.push(Arc::new(struct_array)); - new_fields.push(Arc::new( - arrow_schema::Field::new( - field.name(), - ArrowDataType::Struct(child_fields.into()), - field.is_nullable(), - ) - .with_metadata(self.writer_metadata[idx].clone()), - )); + if has_data { + kind_builder.append_value(BlobKind::Inline as u8); + let value = data_col.value(i); + data_builder.append_value(value); + uri_builder.append_null(); + blob_id_builder.append_null(); + blob_size_builder.append_null(); + position_builder.append_null(); + } else { + data_builder.append_null(); + uri_builder.append_null(); + blob_id_builder.append_null(); + blob_size_builder.append_null(); + kind_builder.append_null(); + position_builder.append_null(); + } } - let new_schema = Arc::new(arrow_schema::Schema::new_with_metadata( - new_fields - .iter() - .map(|f| f.as_ref().clone()) - .collect::>(), - batch_schema.metadata().clone(), - )); + let child_fields = vec![ + ArrowField::new("kind", ArrowDataType::UInt8, true), + ArrowField::new("data", ArrowDataType::LargeBinary, true), + 
ArrowField::new("uri", ArrowDataType::Utf8, true), + ArrowField::new("blob_id", ArrowDataType::UInt32, true), + ArrowField::new("blob_size", ArrowDataType::UInt64, true), + ArrowField::new("position", ArrowDataType::UInt64, true), + ]; - RecordBatch::try_new(new_schema, new_columns) - .map_err(|e| Error::invalid_input(e.to_string())) + let struct_array = StructArray::try_new( + child_fields.clone().into(), + vec![ + Arc::new(kind_builder.finish()), + Arc::new(data_builder.finish()), + Arc::new(uri_builder.finish()), + Arc::new(blob_id_builder.finish()), + Arc::new(blob_size_builder.finish()), + Arc::new(position_builder.finish()), + ], + struct_nulls.cloned(), + )?; + + let field = Arc::new( + ArrowField::new( + field.name(), + ArrowDataType::Struct(child_fields.into()), + field.is_nullable(), + ) + .with_metadata(writer_metadata.clone()), + ); + Ok((Arc::new(struct_array), field)) } pub(crate) async fn finish(&mut self) -> Result<()> { @@ -2171,7 +2293,7 @@ mod tests { }; use arrow_array::RecordBatch; use arrow_array::{ - ArrayRef, RecordBatchIterator, StringArray, StructArray, UInt32Array, UInt64Array, + Array, ArrayRef, RecordBatchIterator, StringArray, StructArray, UInt32Array, UInt64Array, }; use arrow_schema::{DataType, Field, Schema}; use async_trait::async_trait; @@ -2227,6 +2349,32 @@ mod tests { expected: Vec, } + fn nested_blob_v2_batch(blob_array: ArrayRef) -> (Arc, RecordBatch) { + let blob_field = blob_field("blob", true); + let info_fields = vec![Field::new("name", DataType::Utf8, false), blob_field]; + let info_array: ArrayRef = Arc::new( + StructArray::try_new( + info_fields.clone().into(), + vec![ + Arc::new(StringArray::from_iter_values( + (0..blob_array.len()).map(|idx| format!("name-{idx}")), + )) as ArrayRef, + blob_array, + ], + None, + ) + .unwrap(), + ); + + let schema = Arc::new(Schema::new(vec![Field::new( + "info", + DataType::Struct(info_fields.into()), + true, + )])); + let batch = RecordBatch::try_new(schema.clone(), 
vec![info_array]).unwrap(); + (schema, batch) + } + #[cfg(feature = "azure")] fn azure_store_params(account_name: &str) -> ObjectStoreParams { ObjectStoreParams { @@ -3114,6 +3262,114 @@ mod tests { assert_eq!(second.as_ref(), b"world"); } + #[tokio::test] + async fn test_write_and_take_nested_blob_v2() { + let test_dir = TempStrDir::default(); + let packed_payload = vec![0x4A; super::INLINE_MAX + 1024]; + + let mut blob_builder = BlobArrayBuilder::new(3); + blob_builder.push_bytes(b"hello").unwrap(); + blob_builder.push_bytes(&packed_payload).unwrap(); + blob_builder.push_null().unwrap(); + let blob_array: ArrayRef = blob_builder.finish().unwrap(); + + let (schema, batch) = nested_blob_v2_batch(blob_array); + let reader = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema); + + let dataset = Arc::new( + Dataset::write( + reader, + &test_dir, + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }), + ) + .await + .unwrap(), + ); + + let info_batch = dataset + .scan() + .project(&["info"]) + .unwrap() + .try_into_batch() + .await + .unwrap(); + let blob_desc = info_batch + .column(0) + .as_struct() + .column_by_name("blob") + .unwrap() + .as_struct(); + assert_eq!( + blob_desc + .column_by_name("kind") + .unwrap() + .as_primitive::() + .value(0), + BlobKind::Inline as u8 + ); + assert_eq!( + blob_desc + .column_by_name("kind") + .unwrap() + .as_primitive::() + .value(1), + BlobKind::Packed as u8 + ); + + let blobs = dataset + .take_blobs_by_indices(&[0, 1], "info.blob") + .await + .unwrap(); + assert_eq!(blobs.len(), 2); + assert_eq!(blobs[0].read().await.unwrap().as_ref(), b"hello"); + assert_eq!( + blobs[1].read().await.unwrap().as_ref(), + packed_payload.as_slice() + ); + + let null_blobs = dataset + .take_blobs_by_indices(&[2], "info.blob") + .await + .unwrap(); + assert!(null_blobs.is_empty()); + } + + #[tokio::test] + async fn test_nested_blob_v2_requires_v2_2() { + let test_dir = 
TempStrDir::default(); + + let mut blob_builder = BlobArrayBuilder::new(1); + blob_builder.push_bytes(b"hello").unwrap(); + let blob_array: ArrayRef = blob_builder.finish().unwrap(); + + let (schema, batch) = nested_blob_v2_batch(blob_array); + let reader = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema); + + let result = Dataset::write( + reader, + &test_dir, + Some(WriteParams { + data_storage_version: Some(LanceFileVersion::V2_1), + ..Default::default() + }), + ) + .await; + + assert!( + result.is_err(), + "Nested blob v2 should be rejected for file version 2.1" + ); + assert!( + result + .unwrap_err() + .to_string() + .contains("Blob v2 requires file version >= 2.2") + ); + } + #[tokio::test] async fn test_blob_file_read_empty_range_returns_empty_bytes() { let store = reject_empty_range_store(); diff --git a/rust/lance/src/dataset/write.rs b/rust/lance/src/dataset/write.rs index d6ac3ce9d45..ff0a119158c 100644 --- a/rust/lance/src/dataset/write.rs +++ b/rust/lance/src/dataset/write.rs @@ -583,7 +583,7 @@ pub async fn do_write_fragments( }; let external_base_resolver = if storage_version >= LanceFileVersion::V2_2 - && schema.fields.iter().any(|field| field.is_blob_v2()) + && schema.fields_pre_order().any(|field| field.is_blob_v2()) { Some(Arc::new( build_external_base_resolver(dataset, ¶ms).await?, @@ -1061,7 +1061,8 @@ pub async fn write_fragments_internal( (converted_schema, params.storage_version_or_default()) }; - if storage_version < LanceFileVersion::V2_2 && schema.fields.iter().any(|f| f.is_blob_v2()) { + if storage_version < LanceFileVersion::V2_2 && schema.fields_pre_order().any(|f| f.is_blob_v2()) + { return Err(Error::invalid_input(format!( "Blob v2 requires file version >= 2.2 (got {:?})", storage_version @@ -1242,7 +1243,7 @@ pub(super) async fn open_update_writer( // flow through WriteParams. Rebuild the external base resolver here so blob // v2 reference columns can resolve dataset-registered external URIs. 
let external_base_resolver = if storage_version >= LanceFileVersion::V2_2 - && schema.fields.iter().any(|f| f.is_blob_v2()) + && schema.fields_pre_order().any(|f| f.is_blob_v2()) { Some(Arc::new( build_external_base_resolver(Some(dataset), &WriteParams::default()).await?, From 705cef584a10e2e938636b4efc6b2d1048e12ee6 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Tue, 16 Jun 2026 08:44:01 -0700 Subject: [PATCH 108/177] fix: make FTS finalization idempotent (#7272) ## Summary Make distributed FTS finalization idempotent and object-store friendly. The old finalize flow was not retry-safe because it moved build-only `part_<id>_*` partition files before writing the final `metadata.lance`; if the process stopped in that window, a retry saw no final metadata but the original source files had already been deleted. The new flow treats `metadata.lance` as the commit marker and keeps all pre-commit partition sources under stable `staging/` paths. Finalization copies staged files to dense final names without deleting the staged sources or canonical final paths first, writes final metadata, then removes staging best-effort. Any leftover `staging/` objects are filtered from committed FTS `IndexMetadata.files`, so cleanup failures do not make transient files part of the committed index. ## Implementation - Write distributed build-only partition data and per-partition metadata under stable `staging/part_<id>_*` paths. - Keep the non-distributed builder path writing final `part_<id>_*` files and final `metadata.lance` directly, with no dense rewrite. - Add `IndexStore::copy_index_file_to(source, dest, dest_store)` for source-preserving copy-to-new-name; `LanceIndexStore` implements it with object-store copy and supports nested relative paths like `staging/...`. - Copy existing root partition files into staging for the public distributed-from-existing path, `from_existing_index(..., fragment_mask=Some(...))`, before advertising those partitions in staged metadata.
- During `merge_index_files`, discover staged metadata, sort original partition IDs, map them densely to `0..N`, copy `staging/part_<id>_*` to final `part_<id>_*` without pre-deleting final paths, and then write final `metadata.lance`. - Delete staged partition data and staged per-partition metadata only after final metadata is written, and keep that cleanup best-effort. - Filter leftover `staging/` files from FTS segment `IndexMetadata.files` during segment commit. - On retry, existing final `metadata.lance` makes finalize a no-op; without it, staged sources remain authoritative and any partial final files are replaceable from staging. Added regression tests for staged distributed writes, distributed-from-existing finalize, partial final-file retry without root `part_*` deletion, preserving staging when final metadata writing fails, and excluding stale staging files from committed metadata. Validated locally with Rust fmt/tests/clippy, segmented FTS integration tests, Python extension build, the targeted Python progress test, and `uv run make lint-rust`.
--- rust/lance-index/src/scalar.rs | 16 + .../src/scalar/inverted/builder.rs | 955 +++++++++++++++--- rust/lance-index/src/scalar/inverted/index.rs | 10 + rust/lance-index/src/scalar/lance_format.rs | 47 +- rust/lance/src/index.rs | 14 +- rust/lance/src/index/create.rs | 30 + 6 files changed, 944 insertions(+), 128 deletions(-) diff --git a/rust/lance-index/src/scalar.rs b/rust/lance-index/src/scalar.rs index daec92339f8..4830586f85c 100644 --- a/rust/lance-index/src/scalar.rs +++ b/rust/lance-index/src/scalar.rs @@ -288,6 +288,22 @@ pub trait IndexStore: std::fmt::Debug + Send + Sync + DeepSizeOf { /// This is often useful when remapping or updating async fn copy_index_file(&self, name: &str, dest_store: &dyn IndexStore) -> Result; + /// Copy an index file from this store to a new name in another store, leaving the source intact + async fn copy_index_file_to( + &self, + name: &str, + new_name: &str, + dest_store: &dyn IndexStore, + ) -> Result { + if name == new_name { + self.copy_index_file(name, dest_store).await + } else { + Err(Error::not_supported(format!( + "copying index file {name} to {new_name} is not supported by this index store" + ))) + } + } + /// Rename an index file async fn rename_index_file(&self, name: &str, new_name: &str) -> Result; diff --git a/rust/lance-index/src/scalar/inverted/builder.rs b/rust/lance-index/src/scalar/inverted/builder.rs index 24b1eb50203..93932f35332 100644 --- a/rust/lance-index/src/scalar/inverted/builder.rs +++ b/rust/lance-index/src/scalar/inverted/builder.rs @@ -407,11 +407,21 @@ impl InvertedIndexBuilder { ) -> Result> { let partition_id = self.next_partition_id() | self.fragment_mask.unwrap_or(0); builder.set_id(partition_id); - let files = builder.write(dest_store).await?; + let files = builder + .write_to(dest_store, self.partition_write_target()) + .await?; self.new_partitions.push(partition_id); Ok(files) } + fn partition_write_target(&self) -> PartitionWriteTarget { + if self.fragment_mask.is_some() { + 
PartitionWriteTarget::Staged + } else { + PartitionWriteTarget::Final + } + } + fn next_partition_id(&self) -> u64 { self.partitions .iter() @@ -523,7 +533,11 @@ impl InvertedIndexBuilder { if let Some(builder) = merged_tail_partitions { self.new_partitions.push(builder.id()); let mut builder = builder; - files.extend(builder.write(dest_store.as_ref()).await?); + files.extend( + builder + .write_to(dest_store.as_ref(), self.partition_write_target()) + .await?, + ); } log::info!("wait workers indexing elapsed: {:?}", start.elapsed()); Result::Ok(files) @@ -550,12 +564,16 @@ impl InvertedIndexBuilder { .await?; let mut builder = part.into_builder().await?; builder.remap(mapping).await?; - files.extend(builder.write(dest_store).await?); + files.extend( + builder + .write_to(dest_store, self.partition_write_target()) + .await?, + ); } if self.fragment_mask.is_none() { files.push(self.write_metadata(dest_store, &self.partitions).await?); } else { - // in distributed mode, the part_temp_metadata is written by the worker + // in distributed mode, the staged partition metadata is written by the worker for &partition_id in &self.partitions { files.push(self.write_part_metadata(dest_store, partition_id).await?); } @@ -709,26 +727,35 @@ impl InvertedIndexBuilder { .await?; let mut copied = 0; let mut files = Vec::new(); + let target = self.partition_write_target(); for part in self.partitions.iter() { files.push( self.src_store .as_ref() .expect("existing partitions require a source store") - .copy_index_file(&token_file_path(*part), dest_store) + .copy_index_file_to( + &token_file_path(*part), + &target.token_path(*part), + dest_store, + ) .await?, ); files.push( self.src_store .as_ref() .expect("existing partitions require a source store") - .copy_index_file(&posting_file_path(*part), dest_store) + .copy_index_file_to( + &posting_file_path(*part), + &target.posting_path(*part), + dest_store, + ) .await?, ); files.push( self.src_store .as_ref() .expect("existing partitions 
require a source store") - .copy_index_file(&doc_file_path(*part), dest_store) + .copy_index_file_to(&doc_file_path(*part), &target.doc_path(*part), dest_store) .await?, ); copied += 1; @@ -986,11 +1013,22 @@ impl InnerBuilder { } pub async fn write(&mut self, store: &dyn IndexStore) -> Result> { + self.write_to(store, PartitionWriteTarget::Final).await + } + + async fn write_to( + &mut self, + store: &dyn IndexStore, + target: PartitionWriteTarget, + ) -> Result> { let docs = Arc::new(std::mem::take(&mut self.docs)); let files = vec![ - self.write_posting_lists(store, docs.clone()).await?, - self.write_tokens(store).await?, - self.write_docs(store, docs).await?, + self.write_posting_lists(store, docs.clone(), &target.posting_path(self.id)) + .await?, + self.write_tokens(store, &target.token_path(self.id)) + .await?, + self.write_docs(store, docs, &target.doc_path(self.id)) + .await?, ]; Ok(files) } @@ -1000,11 +1038,12 @@ impl InnerBuilder { &mut self, store: &dyn IndexStore, docs: Arc, + path: &str, ) -> Result { let id = self.id; let mut writer = store .new_index_file( - &posting_file_path(self.id), + path, inverted_list_schema_for_version(self.with_position, self.format_version), ) .await?; @@ -1090,29 +1129,57 @@ impl InnerBuilder { } #[instrument(level = "debug", skip_all)] - async fn write_tokens(&mut self, store: &dyn IndexStore) -> Result { + async fn write_tokens(&mut self, store: &dyn IndexStore, path: &str) -> Result { log::info!("writing tokens of partition {}", self.id); let tokens = std::mem::take(&mut self.tokens); let batch = tokens.to_batch(self.token_set_format)?; - let mut writer = store - .new_index_file(&token_file_path(self.id), batch.schema()) - .await?; + let mut writer = store.new_index_file(path, batch.schema()).await?; writer.write_record_batch(batch).await?; writer.finish().await } #[instrument(level = "debug", skip_all)] - async fn write_docs(&mut self, store: &dyn IndexStore, docs: Arc) -> Result { + async fn write_docs( + &mut self, 
+ store: &dyn IndexStore, + docs: Arc, + path: &str, + ) -> Result { log::info!("writing docs of partition {}", self.id); let batch = docs.to_batch()?; - let mut writer = store - .new_index_file(&doc_file_path(self.id), batch.schema()) - .await?; + let mut writer = store.new_index_file(path, batch.schema()).await?; writer.write_record_batch(batch).await?; writer.finish().await } } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum PartitionWriteTarget { + Final, + Staged, +} + +impl PartitionWriteTarget { + fn file_path(self, partition_id: u64, suffix: &str) -> String { + match self { + Self::Final => partition_file_path(partition_id, suffix), + Self::Staged => staged_partition_file_path(partition_id, suffix), + } + } + + fn token_path(self, partition_id: u64) -> String { + self.file_path(partition_id, TOKENS_FILE) + } + + fn posting_path(self, partition_id: u64) -> String { + self.file_path(partition_id, INVERT_LIST_FILE) + } + + fn doc_path(self, partition_id: u64) -> String { + self.file_path(partition_id, DOCS_FILE) + } +} + struct IndexWorker { tokenizer: Box, dest_store: Arc, @@ -1430,8 +1497,13 @@ impl IndexWorker { ); let written_partition_id = builder.id(); let mut builder = builder; + let target = if self.fragment_mask.is_some() { + PartitionWriteTarget::Staged + } else { + PartitionWriteTarget::Final + }; let files = builder - .write(self.dest_store.as_ref()) + .write_to(self.dest_store.as_ref(), target) .await .map_err(|err| { Error::execution(format!( @@ -1782,14 +1854,23 @@ pub(crate) fn doc_file_path(partition_id: u64) -> String { } pub(crate) fn part_metadata_file_path(partition_id: u64) -> String { - format!("part_{}_{}", partition_id, METADATA_FILE) + staged_partition_file_path(partition_id, METADATA_FILE) } const PARTITION_FILE_SUFFIXES: [&str; 3] = [TOKENS_FILE, INVERT_LIST_FILE, DOCS_FILE]; -// Each remapped file is renamed twice: first to a temp path (phase 1), then to -// its final path (phase 2). 
Keep in sync with the two rename loops below in -// `merge_metadata_files`. -const PARTITION_FILE_RENAME_PHASES: u64 = 2; +const STAGED_PARTITION_DIR: &str = "staging"; + +fn partition_file_path(partition_id: u64, suffix: &str) -> String { + format!("part_{}_{}", partition_id, suffix) +} + +fn staged_partition_file_path(partition_id: u64, suffix: &str) -> String { + format!( + "{}/{}", + STAGED_PARTITION_DIR, + partition_file_path(partition_id, suffix) + ) +} pub async fn merge_index_files( object_store: &ObjectStore, @@ -1797,33 +1878,65 @@ pub async fn merge_index_files( store: Arc, progress: Arc, ) -> Result<()> { - // List all partition metadata files in the index directory - let part_metadata_files = list_metadata_files(object_store, index_dir).await?; + let metadata_path = index_dir.clone().join(METADATA_FILE); + if object_store.exists(&metadata_path).await? { + return Ok(()); + } + + // List all staged partition metadata files in the index directory + let index_files = list_index_files(object_store, index_dir).await?; + let part_metadata_files = metadata_files(&index_files); + if part_metadata_files.is_empty() { + return Err(Error::invalid_input_source( + format!( + "No partition metadata files found in index directory: {}", + index_dir + ) + .into(), + )); + } // Call merge_metadata_files function for inverted index merge_metadata_files(store, &part_metadata_files, progress).await } -/// List and filter metadata files from the index directory -/// Returns partition metadata files -async fn list_metadata_files(object_store: &ObjectStore, index_dir: &Path) -> Result> { - // List all partition metadata files in the index directory - let mut part_metadata_files = Vec::new(); - let mut list_stream = object_store.list(Some(index_dir.clone())); +async fn list_index_files(object_store: &ObjectStore, index_dir: &Path) -> Result> { + let mut index_files = Vec::new(); + let mut list_stream = object_store.read_dir_all(index_dir, None); while let Some(item) = 
list_stream.next().await { match item { Ok(meta) => { - let file_name = meta.location.filename().unwrap_or_default(); - // Filter files matching the pattern part_*_metadata.lance - if file_name.starts_with("part_") && file_name.ends_with("_metadata.lance") { - part_metadata_files.push(file_name.to_string()); - } + let location = meta.location.as_ref().trim_start_matches('/'); + let index_dir = index_dir.as_ref().trim_start_matches('/'); + let relative_path = location + .strip_prefix(index_dir) + .map(|s| s.trim_start_matches('/').to_string()) + .unwrap_or_else(|| meta.location.filename().unwrap_or("").to_string()); + index_files.push(relative_path); } Err(err) => return Err(err), } } + Ok(index_files) +} + +fn metadata_files(index_files: &[String]) -> Vec { + index_files + .iter() + .filter(|file_name| { + file_name.starts_with(&format!("{}/part_", STAGED_PARTITION_DIR)) + && file_name.ends_with("_metadata.lance") + }) + .cloned() + .collect() +} + +#[cfg(test)] +async fn list_metadata_files(object_store: &ObjectStore, index_dir: &Path) -> Result> { + let index_files = list_index_files(object_store, index_dir).await?; + let part_metadata_files = metadata_files(&index_files); if part_metadata_files.is_empty() { return Err(Error::invalid_input_source( format!( @@ -1914,89 +2027,35 @@ async fn merge_metadata_files( progress.stage_complete("read_partition_metadata").await?; // Create ID mapping: sorted original IDs -> 0,1,2... 
- let mut sorted_ids = all_partitions.clone(); + let mut sorted_ids = all_partitions; sorted_ids.sort(); sorted_ids.dedup(); - let id_mapping: HashMap = sorted_ids + let id_mapping: Vec<(u64, u64)> = sorted_ids .iter() .enumerate() .map(|(new_id, &old_id)| (old_id, new_id as u64)) .collect(); - // Safe rename partition files using temporary files to avoid overwrite - let timestamp = std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap() - .as_secs(); - - let changed_partition_count = id_mapping - .iter() - .filter(|(old_id, new_id)| old_id != new_id) - .count() as u64; - let total_renames = changed_partition_count - * PARTITION_FILE_SUFFIXES.len() as u64 - * PARTITION_FILE_RENAME_PHASES; + let total_copies = id_mapping.len() as u64 * PARTITION_FILE_SUFFIXES.len() as u64; progress - .stage_start("remap_partition_files", Some(total_renames), "files") + .stage_start("remap_partition_files", Some(total_copies), "files") .await?; - // Phase 1: Move files to temporary locations - let mut temp_files: Vec<(String, String, String)> = Vec::new(); // (temp_path, old_path, final_path) - let mut renamed_files = 0u64; + let mut copied_files = 0u64; - for (&old_id, &new_id) in &id_mapping { - if old_id != new_id { - for suffix in PARTITION_FILE_SUFFIXES { - let old_path = format!("part_{}_{}", old_id, suffix); - let new_path = format!("part_{}_{}", new_id, suffix); - let temp_path = format!("temp_{}_{}", timestamp, old_path); - - // Move to temporary location first to avoid overwrite - if let Err(e) = store.rename_index_file(&old_path, &temp_path).await { - // Rollback phase 1: restore files from temp locations - for (temp_name, old_name, _) in temp_files.iter().rev() { - let _ = store.rename_index_file(temp_name, old_name).await; - } - return Err(Error::index(format!( - "Failed to move {} to temp {}: {}", - old_path, temp_path, e - ))); - } - temp_files.push((temp_path, old_path, new_path)); - renamed_files += 1; - progress - 
.stage_progress("remap_partition_files", renamed_files) - .await?; - } - } - } - - // Phase 2: Move from temporary to final locations - let mut completed_renames: Vec<(String, String)> = Vec::new(); // (final_path, temp_path) - - for (temp_path, _old_path, final_path) in &temp_files { - if let Err(e) = store.rename_index_file(temp_path, final_path).await { - // Rollback phase 2: restore completed renames and remaining temps - for (final_name, temp_name) in completed_renames.iter().rev() { - let _ = store.rename_index_file(final_name, temp_name).await; - } - // Restore remaining temp files to original locations - for (temp_name, orig_name, _) in temp_files.iter() { - if !completed_renames.iter().any(|(_, t)| t == temp_name) { - let _ = store.rename_index_file(temp_name, orig_name).await; - } - } - return Err(Error::index(format!( - "Failed to rename {} to {}: {}", - temp_path, final_path, e - ))); + for &(old_id, new_id) in &id_mapping { + for suffix in PARTITION_FILE_SUFFIXES { + let staged_path = staged_partition_file_path(old_id, suffix); + let final_path = partition_file_path(new_id, suffix); + store + .copy_index_file_to(&staged_path, &final_path, store.as_ref()) + .await?; + copied_files += 1; + progress + .stage_progress("remap_partition_files", copied_files) + .await?; } - completed_renames.push((final_path.clone(), temp_path.clone())); - renamed_files += 1; - progress - .stage_progress("remap_partition_files", renamed_files) - .await?; } progress.stage_complete("remap_partition_files").await?; @@ -2023,10 +2082,15 @@ async fn merge_metadata_files( progress.stage_progress("write_merged_metadata", 1).await?; progress.stage_complete("write_merged_metadata").await?; - // Cleanup partition metadata files + // Cleanup staged partition metadata files for file_name in part_metadata_files { - if file_name.starts_with("part_") && file_name.ends_with("_metadata.lance") { - let _ = store.delete_index_file(file_name).await; + let _ = 
store.delete_index_file(file_name).await; + } + for &(old_id, _) in &id_mapping { + for suffix in PARTITION_FILE_SUFFIXES { + let _ = store + .delete_index_file(&staged_partition_file_path(old_id, suffix)) + .await; } } @@ -2246,6 +2310,234 @@ mod tests { } } + #[derive(Debug, Clone)] + struct NoRenameStore { + inner: Arc, + final_delete_count: Option>, + } + + impl NoRenameStore { + fn new(inner: Arc) -> Self { + Self { + inner, + final_delete_count: None, + } + } + + fn with_final_delete_tracking(inner: Arc) -> Self { + Self { + inner, + final_delete_count: Some(Arc::new(AtomicUsize::new(0))), + } + } + + fn final_delete_count(&self) -> usize { + self.final_delete_count + .as_ref() + .map(|count| count.load(Ordering::SeqCst)) + .unwrap_or_default() + } + + fn unwrap_dest_store(dest_store: &dyn IndexStore) -> &dyn IndexStore { + dest_store + .as_any() + .downcast_ref::() + .map(|store| store.inner.as_ref()) + .unwrap_or(dest_store) + } + } + + impl DeepSizeOf for NoRenameStore { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { + self.inner.deep_size_of_children(context) + } + } + + #[async_trait] + impl IndexStore for NoRenameStore { + fn as_any(&self) -> &dyn Any { + self + } + + fn clone_arc(&self) -> Arc { + Arc::new(self.clone()) + } + + fn io_parallelism(&self) -> usize { + self.inner.io_parallelism() + } + + async fn new_index_file( + &self, + name: &str, + schema: Arc, + ) -> Result> { + self.inner.new_index_file(name, schema).await + } + + async fn open_index_file(&self, name: &str) -> Result> { + self.inner.open_index_file(name).await + } + + async fn copy_index_file( + &self, + name: &str, + dest_store: &dyn IndexStore, + ) -> Result { + self.inner + .copy_index_file(name, Self::unwrap_dest_store(dest_store)) + .await + } + + async fn copy_index_file_to( + &self, + name: &str, + new_name: &str, + dest_store: &dyn IndexStore, + ) -> Result { + self.inner + .copy_index_file_to(name, new_name, 
Self::unwrap_dest_store(dest_store)) + .await + } + + async fn rename_index_file(&self, name: &str, new_name: &str) -> Result { + Err(Error::internal(format!( + "merge_index_files should not rename partition file {name} to {new_name}" + ))) + } + + async fn delete_index_file(&self, name: &str) -> Result<()> { + if name.starts_with("part_") + && let Some(count) = &self.final_delete_count + { + count.fetch_add(1, Ordering::SeqCst); + } + self.inner.delete_index_file(name).await + } + + async fn list_files_with_sizes(&self) -> Result> { + self.inner.list_files_with_sizes().await + } + } + + #[derive(Debug)] + struct FailMetadataStore { + inner: Arc, + } + + impl FailMetadataStore { + fn new(inner: Arc) -> Self { + Self { inner } + } + + fn unwrap_dest_store(dest_store: &dyn IndexStore) -> &dyn IndexStore { + dest_store + .as_any() + .downcast_ref::() + .map(|store| store.inner.as_ref()) + .unwrap_or(dest_store) + } + } + + impl DeepSizeOf for FailMetadataStore { + fn deep_size_of_children(&self, context: &mut lance_core::deepsize::Context) -> usize { + self.inner.deep_size_of_children(context) + } + } + + #[async_trait] + impl IndexStore for FailMetadataStore { + fn as_any(&self) -> &dyn Any { + self + } + + fn clone_arc(&self) -> Arc { + Arc::new(Self { + inner: self.inner.clone(), + }) + } + + fn io_parallelism(&self) -> usize { + self.inner.io_parallelism() + } + + async fn new_index_file( + &self, + name: &str, + schema: Arc, + ) -> Result> { + let writer = self.inner.new_index_file(name, schema).await?; + if name == METADATA_FILE { + Ok(Box::new(FailFinishWriter { inner: writer })) + } else { + Ok(writer) + } + } + + async fn open_index_file(&self, name: &str) -> Result> { + self.inner.open_index_file(name).await + } + + async fn copy_index_file( + &self, + name: &str, + dest_store: &dyn IndexStore, + ) -> Result { + self.inner + .copy_index_file(name, Self::unwrap_dest_store(dest_store)) + .await + } + + async fn copy_index_file_to( + &self, + name: &str, + 
new_name: &str, + dest_store: &dyn IndexStore, + ) -> Result { + self.inner + .copy_index_file_to(name, new_name, Self::unwrap_dest_store(dest_store)) + .await + } + + async fn rename_index_file(&self, name: &str, new_name: &str) -> Result { + self.inner.rename_index_file(name, new_name).await + } + + async fn delete_index_file(&self, name: &str) -> Result<()> { + self.inner.delete_index_file(name).await + } + + async fn list_files_with_sizes(&self) -> Result> { + self.inner.list_files_with_sizes().await + } + } + + struct FailFinishWriter { + inner: Box, + } + + #[async_trait] + impl IndexWriter for FailFinishWriter { + async fn write_record_batch(&mut self, batch: RecordBatch) -> Result { + self.inner.write_record_batch(batch).await + } + + async fn add_global_buffer(&mut self, data: Bytes) -> Result { + self.inner.add_global_buffer(data).await + } + + async fn finish(&mut self) -> Result { + Err(Error::internal("injected metadata write failure")) + } + + async fn finish_with_metadata( + &mut self, + _metadata: HashMap, + ) -> Result { + Err(Error::internal("injected metadata write failure")) + } + } + #[derive(Debug)] struct CountingWriter { path: String, @@ -2412,12 +2704,446 @@ mod tests { let store = CountingStore::new(); let docs = Arc::new(std::mem::take(&mut builder.docs)); - builder.write_posting_lists(&store, docs).await?; + builder + .write_posting_lists(&store, docs, &posting_file_path(0)) + .await?; assert_eq!(store.write_count(), 1); Ok(()) } + async fn write_partition_file_marker( + store: &dyn IndexStore, + path: &str, + partition_id: u64, + ) -> Result<()> { + let schema = Arc::new(Schema::new(vec![Field::new( + "partition_id", + DataType::UInt64, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(UInt64Array::from(vec![partition_id]))], + )?; + let mut writer = store.new_index_file(path, schema).await?; + writer.write_record_batch(batch).await?; + writer.finish().await?; + Ok(()) + } + + async fn 
write_partition_files( + store: &dyn IndexStore, + partition_id: u64, + target: PartitionWriteTarget, + ) -> Result<()> { + write_partition_file_marker(store, &target.token_path(partition_id), partition_id).await?; + write_partition_file_marker(store, &target.posting_path(partition_id), partition_id) + .await?; + write_partition_file_marker(store, &target.doc_path(partition_id), partition_id).await?; + Ok(()) + } + + async fn read_partition_file_marker(store: &dyn IndexStore, path: &str) -> Result { + let reader = store.open_index_file(path).await?; + let batch = reader.read_range(0..1, None).await?; + let partition_ids = batch.column(0).as_primitive::(); + Ok(partition_ids.value(0)) + } + + async fn assert_partition_file_markers( + store: &dyn IndexStore, + partition_id: u64, + expected_marker: u64, + ) -> Result<()> { + assert_eq!( + read_partition_file_marker(store, &token_file_path(partition_id)).await?, + expected_marker + ); + assert_eq!( + read_partition_file_marker(store, &posting_file_path(partition_id)).await?, + expected_marker + ); + assert_eq!( + read_partition_file_marker(store, &doc_file_path(partition_id)).await?, + expected_marker + ); + Ok(()) + } + + #[tokio::test] + async fn test_merge_index_files_remaps_staged_partitions_without_rename() -> Result<()> { + let index_dir = TempDir::default(); + let object_store = Arc::new(ObjectStore::local()); + let base_store: Arc = Arc::new(LanceIndexStore::new( + object_store.clone(), + index_dir.obj_path(), + Arc::new(LanceCache::no_cache()), + )); + let store = Arc::new(NoRenameStore::new(base_store.clone())); + let partitions = vec![5_u64, 1_u64, (17_u64 << 32) | 2]; + let metadata_builder = InvertedIndexBuilder::from_existing_index( + InvertedIndexParams::default(), + None, + Vec::new(), + TokenSetFormat::default(), + None, + RoaringBitmap::new(), + ); + + for partition_id in &partitions { + write_partition_files( + base_store.as_ref(), + *partition_id, + PartitionWriteTarget::Staged, + ) + .await?; + 
metadata_builder + .write_part_metadata(base_store.as_ref(), *partition_id) + .await?; + } + + merge_index_files( + object_store.as_ref(), + &index_dir.obj_path(), + store, + noop_progress(), + ) + .await?; + + let metadata_reader = base_store.open_index_file(METADATA_FILE).await?; + let metadata = &metadata_reader.schema().metadata; + let written_partitions: Vec = serde_json::from_str( + metadata + .get("partitions") + .expect("partitions missing from metadata"), + )?; + let mut expected_partitions = partitions.clone(); + expected_partitions.sort_unstable(); + expected_partitions.dedup(); + let remapped_partitions = (0..expected_partitions.len() as u64).collect::>(); + assert_eq!(written_partitions, remapped_partitions); + + for (new_id, old_id) in expected_partitions.iter().enumerate() { + assert_partition_file_markers(base_store.as_ref(), new_id as u64, *old_id).await?; + assert!( + base_store + .open_index_file(&part_metadata_file_path(*old_id)) + .await + .is_err(), + "partition metadata should be cleaned up after final metadata is written" + ); + for suffix in PARTITION_FILE_SUFFIXES { + assert!( + base_store + .open_index_file(&staged_partition_file_path(*old_id, suffix)) + .await + .is_err(), + "staged partition files should be cleaned up after final metadata is written" + ); + } + } + + Ok(()) + } + + #[tokio::test] + async fn test_merge_index_files_rewrites_partial_final_files_from_staging() -> Result<()> { + let index_dir = TempDir::default(); + let object_store = Arc::new(ObjectStore::local()); + let base_store: Arc = Arc::new(LanceIndexStore::new( + object_store.clone(), + index_dir.obj_path(), + Arc::new(LanceCache::no_cache()), + )); + let store = Arc::new(NoRenameStore::with_final_delete_tracking( + base_store.clone(), + )); + let partitions = vec![1_u64, 5_u64]; + let metadata_builder = InvertedIndexBuilder::from_existing_index( + InvertedIndexParams::default(), + None, + Vec::new(), + TokenSetFormat::default(), + None, + RoaringBitmap::new(), + ); 
+ + for partition_id in &partitions { + write_partition_files( + base_store.as_ref(), + *partition_id, + PartitionWriteTarget::Staged, + ) + .await?; + metadata_builder + .write_part_metadata(base_store.as_ref(), *partition_id) + .await?; + } + + for suffix in PARTITION_FILE_SUFFIXES { + write_partition_file_marker(base_store.as_ref(), &partition_file_path(1, suffix), 999) + .await?; + } + + merge_index_files( + object_store.as_ref(), + &index_dir.obj_path(), + store.clone(), + noop_progress(), + ) + .await?; + + assert_partition_file_markers(base_store.as_ref(), 0, 1).await?; + assert_partition_file_markers(base_store.as_ref(), 1, 5).await?; + assert_eq!( + store.final_delete_count(), + 0, + "merge should overwrite final partition files without deleting them first" + ); + + Ok(()) + } + + #[tokio::test] + async fn test_distributed_from_existing_copies_existing_partitions_to_staging_and_finalizes() + -> Result<()> { + let object_store = Arc::new(ObjectStore::local()); + let source_dir = TempDir::default(); + let dest_dir = TempDir::default(); + let source_store: Arc = Arc::new(LanceIndexStore::new( + object_store.clone(), + source_dir.obj_path(), + Arc::new(LanceCache::no_cache()), + )); + let dest_store: Arc = Arc::new(LanceIndexStore::new( + object_store.clone(), + dest_dir.obj_path(), + Arc::new(LanceCache::no_cache()), + )); + let merge_store = Arc::new(NoRenameStore::new(dest_store.clone())); + let fragment_mask = 7_u64 << 32; + let partitions = vec![fragment_mask | 5, fragment_mask | 1]; + + for partition_id in &partitions { + write_partition_files( + source_store.as_ref(), + *partition_id, + PartitionWriteTarget::Final, + ) + .await?; + } + + let builder = InvertedIndexBuilder::from_existing_index( + InvertedIndexParams::default(), + Some(source_store.clone()), + partitions.clone(), + TokenSetFormat::default(), + Some(fragment_mask), + RoaringBitmap::new(), + ); + builder.write(dest_store.as_ref()).await?; + + for partition_id in &partitions { + 
assert_partition_file_markers(source_store.as_ref(), *partition_id, *partition_id) + .await?; + for suffix in PARTITION_FILE_SUFFIXES { + let staged_path = staged_partition_file_path(*partition_id, suffix); + assert_eq!( + read_partition_file_marker(dest_store.as_ref(), &staged_path).await?, + *partition_id + ); + assert!( + dest_store + .open_index_file(&partition_file_path(*partition_id, suffix)) + .await + .is_err(), + "distributed existing partition should be staged instead of copied to root" + ); + } + dest_store + .open_index_file(&part_metadata_file_path(*partition_id)) + .await?; + } + + merge_index_files( + object_store.as_ref(), + &dest_dir.obj_path(), + merge_store, + noop_progress(), + ) + .await?; + + let mut expected_partitions = partitions.clone(); + expected_partitions.sort_unstable(); + for (new_id, old_id) in expected_partitions.iter().enumerate() { + assert_partition_file_markers(dest_store.as_ref(), new_id as u64, *old_id).await?; + for suffix in PARTITION_FILE_SUFFIXES { + assert!( + dest_store + .open_index_file(&staged_partition_file_path(*old_id, suffix)) + .await + .is_err(), + "staged partition files should be cleaned after final metadata is written" + ); + } + } + + Ok(()) + } + + #[tokio::test] + async fn test_merge_index_files_keeps_staging_when_final_metadata_write_fails() -> Result<()> { + let index_dir = TempDir::default(); + let object_store = Arc::new(ObjectStore::local()); + let base_store: Arc = Arc::new(LanceIndexStore::new( + object_store.clone(), + index_dir.obj_path(), + Arc::new(LanceCache::no_cache()), + )); + let failing_store = Arc::new(FailMetadataStore::new(base_store.clone())); + let partitions = vec![1_u64, 5_u64]; + let metadata_builder = InvertedIndexBuilder::from_existing_index( + InvertedIndexParams::default(), + None, + Vec::new(), + TokenSetFormat::default(), + None, + RoaringBitmap::new(), + ); + + for partition_id in &partitions { + write_partition_files( + base_store.as_ref(), + *partition_id, + 
PartitionWriteTarget::Staged, + ) + .await?; + metadata_builder + .write_part_metadata(base_store.as_ref(), *partition_id) + .await?; + } + + let err = merge_index_files( + object_store.as_ref(), + &index_dir.obj_path(), + failing_store, + noop_progress(), + ) + .await + .unwrap_err(); + assert!( + err.to_string().contains("metadata write failure"), + "expected injected metadata failure, got: {err}" + ); + + for partition_id in &partitions { + base_store + .open_index_file(&part_metadata_file_path(*partition_id)) + .await?; + for suffix in PARTITION_FILE_SUFFIXES { + let staged_path = staged_partition_file_path(*partition_id, suffix); + assert_eq!( + read_partition_file_marker(base_store.as_ref(), &staged_path).await?, + *partition_id + ); + } + } + + Ok(()) + } + + #[tokio::test] + async fn test_distributed_build_writes_partition_data_to_staging() -> Result<()> { + let index_dir = TempDir::default(); + let object_store = ObjectStore::local(); + let store = Arc::new(LanceIndexStore::new( + object_store.into(), + index_dir.obj_path(), + Arc::new(LanceCache::no_cache()), + )); + + let fragment_mask = 7_u64 << 32; + let batch = make_doc_batch("hello world", fragment_mask); + let stream = RecordBatchStreamAdapter::new(batch.schema(), stream::iter(vec![Ok(batch)])); + let stream = Box::pin(stream); + let mut builder = InvertedIndexBuilder::new_with_fragment_mask( + InvertedIndexParams::default(), + Some(fragment_mask), + ); + builder.update(stream, store.as_ref(), None).await?; + + let part_metadata_files = + list_metadata_files(&ObjectStore::local(), &index_dir.obj_path()).await?; + assert_eq!(part_metadata_files.len(), 1); + assert!( + part_metadata_files[0].starts_with("staging/part_"), + "partition metadata should be written to staging" + ); + let reader = store.open_index_file(&part_metadata_files[0]).await?; + let partition_ids: Vec = serde_json::from_str( + reader + .schema() + .metadata + .get("partitions") + .expect("partitions missing from metadata"), + )?; + 
assert_eq!(partition_ids.len(), 1); + let partition_id = partition_ids[0]; + + store + .open_index_file(&staged_partition_file_path(partition_id, TOKENS_FILE)) + .await?; + assert!( + store + .open_index_file(&partition_file_path(partition_id, METADATA_FILE)) + .await + .is_err(), + "distributed build-only metadata should not be written to root partition metadata paths" + ); + assert!( + store + .open_index_file(&token_file_path(partition_id)) + .await + .is_err(), + "distributed build-only data should not be written to final partition paths" + ); + + Ok(()) + } + + #[tokio::test] + async fn test_merge_index_files_is_noop_when_metadata_exists() -> Result<()> { + let index_dir = TempDir::default(); + let object_store = Arc::new(ObjectStore::local()); + let store: Arc = Arc::new(LanceIndexStore::new( + object_store.clone(), + index_dir.obj_path(), + Arc::new(LanceCache::no_cache()), + )); + let metadata_builder = InvertedIndexBuilder::from_existing_index( + InvertedIndexParams::default(), + None, + vec![42], + TokenSetFormat::default(), + None, + RoaringBitmap::new(), + ); + metadata_builder + .write_metadata(store.as_ref(), &[42]) + .await?; + + merge_index_files( + object_store.as_ref(), + &index_dir.obj_path(), + store, + noop_progress(), + ) + .await?; + + Ok(()) + } + #[tokio::test] async fn test_build_only_path_writes_partitions_as_is() -> Result<()> { let src_dir = TempDir::default(); @@ -2856,7 +3582,6 @@ mod tests { } }) .collect::>(); - let read_start = tags .iter() .position(|e| e == "start:read_partition_metadata") @@ -2894,8 +3619,8 @@ mod tests { ); assert_eq!( remap_progress.last().copied().unwrap_or_default(), - 12, - "expected remap_partition_files progress to cover both rename phases" + 6, + "expected remap_partition_files progress to cover staged-to-final copies" ); assert!( tags.iter().any(|e| e == "progress:write_merged_metadata"), diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index 
92379a9e350..8e662f5db6f 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -6460,6 +6460,16 @@ mod tests { ) -> Result { self.inner.copy_index_file(name, dest_store).await } + async fn copy_index_file_to( + &self, + name: &str, + new_name: &str, + dest_store: &dyn IndexStore, + ) -> Result { + self.inner + .copy_index_file_to(name, new_name, dest_store) + .await + } async fn rename_index_file( &self, name: &str, diff --git a/rust/lance-index/src/scalar/lance_format.rs b/rust/lance-index/src/scalar/lance_format.rs index 562945b8f0d..2f82deb8403 100644 --- a/rust/lance-index/src/scalar/lance_format.rs +++ b/rust/lance-index/src/scalar/lance_format.rs @@ -99,6 +99,24 @@ impl LanceIndexStore { self.file_sizes = file_sizes; self } + + fn index_file_path(&self, name: &str) -> Result { + let relative_path = Path::parse(name).map_err(|err| { + Error::invalid_input(format!("invalid index file path {name:?}: {err}")) + })?; + if self.index_dir.is_root() { + return Ok(relative_path); + } + if relative_path.is_root() { + return Ok(self.index_dir.clone()); + } + Path::parse(format!( + "{}/{}", + self.index_dir.as_ref(), + relative_path.as_ref() + )) + .map_err(|err| Error::invalid_input(format!("invalid index file path {name:?}: {err}"))) + } } #[async_trait] @@ -397,7 +415,7 @@ impl IndexStore for LanceIndexStore { name: &str, schema: Arc, ) -> Result> { - let path = self.index_dir.clone().join(name); + let path = self.index_file_path(name)?; let schema = schema.as_ref().try_into()?; let writer = self.object_store.create(&path).await?; let writer = current_writer::FileWriter::try_new( @@ -415,7 +433,7 @@ impl IndexStore for LanceIndexStore { } async fn open_index_file(&self, name: &str) -> Result> { - let path = self.index_dir.clone().join(name); + let path = self.index_file_path(name)?; // Use cached file size if available, otherwise unknown (requires HEAD call) let cached_size = self .file_sizes @@ -436,7 +454,7 
@@ impl IndexStore for LanceIndexStore { Err(e) => { // If the error is a version conflict we can try to read the file with v1 reader if let Error::VersionConflict { .. } = e { - let path = self.index_dir.clone().join(name); + let path = self.index_file_path(name)?; let file_reader = PreviousFileReader::try_new_self_described( &self.object_store, &path, @@ -452,7 +470,16 @@ impl IndexStore for LanceIndexStore { } async fn copy_index_file(&self, name: &str, dest_store: &dyn IndexStore) -> Result { - let path = self.index_dir.clone().join(name); + self.copy_index_file_to(name, name, dest_store).await + } + + async fn copy_index_file_to( + &self, + name: &str, + new_name: &str, + dest_store: &dyn IndexStore, + ) -> Result { + let path = self.index_file_path(name)?; let other_store = dest_store.as_any().downcast_ref::(); match other_store { @@ -460,21 +487,21 @@ impl IndexStore for LanceIndexStore { // If both this store and the destination are lance stores we can use object_store's copy // This does blindly assume that both stores are using the same underlying object_store // but there is no easy way to verify this and it happens to always be true at the moment - let dest_path = dest_store.index_dir.clone().join(name); + let dest_path = dest_store.index_file_path(new_name)?; self.object_store.copy(&path, &dest_path).await?; let size_bytes = match self.file_sizes.get(name) { Some(size_bytes) => *size_bytes, None => self.object_store.size(&path).await?, }; Ok(IndexFile { - path: name.to_string(), + path: new_name.to_string(), size_bytes, }) } _ => { let reader = self.open_index_file(name).await?; let mut writer = dest_store - .new_index_file(name, Arc::new(reader.schema().into())) + .new_index_file(new_name, Arc::new(reader.schema().into())) .await?; for offset in (0..reader.num_rows()).step_by(4096) { @@ -488,8 +515,8 @@ impl IndexStore for LanceIndexStore { } async fn rename_index_file(&self, name: &str, new_name: &str) -> Result { - let path = 
self.index_dir.clone().join(name); - let new_path = self.index_dir.clone().join(new_name); + let path = self.index_file_path(name)?; + let new_path = self.index_file_path(new_name)?; self.object_store.copy(&path, &new_path).await?; self.object_store.delete(&path).await?; let size_bytes = match self.file_sizes.get(name) { @@ -503,7 +530,7 @@ impl IndexStore for LanceIndexStore { } async fn delete_index_file(&self, name: &str) -> Result<()> { - let path = self.index_dir.clone().join(name); + let path = self.index_file_path(name)?; self.object_store.delete(&path).await } diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 78e5c429527..69acc69b6da 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -57,7 +57,7 @@ use lance_io::utils::{ read_version, }; use lance_table::format::{Fragment, SelfDescribingFileReader}; -use lance_table::format::{IndexMetadata, list_index_files_with_sizes}; +use lance_table::format::{IndexFile, IndexMetadata, list_index_files_with_sizes}; use lance_table::io::manifest::read_manifest_indexes; use roaring::RoaringBitmap; use scalar::index_matches_criteria; @@ -166,7 +166,8 @@ pub(crate) async fn build_index_metadata_from_segments( let mut new_indices = Vec::with_capacity(segments.len()); for segment in segments { let (uuid, fragment_bitmap, index_details, index_version) = segment.into_parts(); - if index_details.type_url.ends_with("InvertedIndexDetails") { + let is_inverted_index = index_details.type_url.ends_with("InvertedIndexDetails"); + if is_inverted_index { let metadata = IndexMetadata { uuid, name: index_name.to_string(), @@ -183,7 +184,10 @@ pub(crate) async fn build_index_metadata_from_segments( .await?; } let index_dir = dataset.indices_dir().clone().join(uuid.to_string()); - let files = list_index_files_with_sizes(&dataset.object_store, &index_dir).await?; + let mut files = list_index_files_with_sizes(&dataset.object_store, &index_dir).await?; + if is_inverted_index { + 
retain_committed_inverted_files(&mut files); + } new_indices.push(IndexMetadata { uuid, name: index_name.to_string(), @@ -201,6 +205,10 @@ pub(crate) async fn build_index_metadata_from_segments( Ok(new_indices) } +fn retain_committed_inverted_files(files: &mut Vec) { + files.retain(|file| !file.path.starts_with("staging/")); +} + fn validate_segment_index_details(index_name: &str, segments: &[IndexMetadata]) -> Result<()> { let mut type_url = None::<&str>; for segment in segments { diff --git a/rust/lance/src/index/create.rs b/rust/lance/src/index/create.rs index 2b6992e4849..bbb055463dc 100644 --- a/rust/lance/src/index/create.rs +++ b/rust/lance/src/index/create.rs @@ -1993,6 +1993,23 @@ mod tests { let segments = input_segments.clone(); assert_eq!(segments.len(), input_segments.len()); + crate::index::scalar::inverted::finalize_segment_files_if_needed( + &dataset, + &input_segments[0], + ) + .await + .unwrap(); + let stale_staging_path = dataset + .indices_dir() + .join(input_segments[0].uuid.to_string()) + .join("staging") + .join("orphan.lance"); + dataset + .object_store + .put(&stale_staging_path, b"stale") + .await + .unwrap(); + dataset .commit_existing_index_segments("text_idx", "text", segments) .await @@ -2016,6 +2033,19 @@ mod tests { let indices = dataset.load_indices_by_name("text_idx").await.unwrap(); assert_eq!(indices.len(), input_segments.len()); + let finalized_segment = indices + .iter() + .find(|index| index.uuid == input_segments[0].uuid) + .expect("finalized segment should be committed"); + assert!( + finalized_segment + .files + .as_ref() + .expect("committed segment should track files") + .iter() + .all(|file| !file.path.starts_with("staging/")), + "stale staging files must not be committed in IndexMetadata.files" + ); } #[tokio::test] From 0362c198424aa2fdd3fbcca2df4523a055e4a0a1 Mon Sep 17 00:00:00 2001 From: Muhammad Aiman <84276911+aimanmalib@users.noreply.github.com> Date: Tue, 16 Jun 2026 23:59:43 +0800 Subject: [PATCH 109/177] feat: 
expose io_buffer_size in CompactionOptions (#7226) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary `CompactionOptions` did not expose `io_buffer_size`, even though the scanner used during compaction supports it. Compaction builds its scan reader in `prepare_reader`, which only forwarded `batch_size` to the scanner — the `io_buffer_size` knob was never set. This matters because a single batch larger than the I/O buffer size causes the scanner to **deadlock** (documented on `Scanner::io_buffer_size`), and since the backpressure warning was downgraded to `debug`, this deadlock is now silent at the default. Users had no way to raise the buffer to avoid it during compaction. Resolves #4946. ## Changes - Add `io_buffer_size: Option<u64>` to `CompactionOptions` (with `Default` = `None`). - Support the `lance.compaction.io_buffer_size` manifest config key in `apply_dataset_config`. - Plumb the value through `prepare_reader` → `scanner.io_buffer_size(...)`, mirroring the existing `batch_size` handling. - Update the Python binding to keep parameter names consistent across languages: - `parse_compaction_options` accepts `io_buffer_size` - `CompactionOptions` TypedDict documents the field ## Testing - Extended `test_from_dataset_config` to assert the `lance.compaction.io_buffer_size` key round-trips. - Added `test_compact_with_io_buffer_size` (parametrized over `Legacy`/`Stable` file versions) that runs `compact_files` with an explicit `io_buffer_size` and verifies the compaction succeeds and preserves all rows. - `cargo test -p lance --lib dataset::optimize::tests` → **76 passed, 0 failed**. - `cargo clippy -p lance --lib --tests` → clean. - `cargo fmt --all -- --check` → clean. ## Notes This is a non-breaking, additive change — the new field defaults to `None`, preserving existing behavior when unset. 
--- python/python/lance/optimize.py | 8 +++ python/src/dataset/optimize.rs | 3 ++ rust/lance/src/dataset/optimize.rs | 80 ++++++++++++++++++++++++++++++ 3 files changed, 91 insertions(+) diff --git a/python/python/lance/optimize.py b/python/python/lance/optimize.py index 8b98308d442..3ac7547960b 100644 --- a/python/python/lance/optimize.py +++ b/python/python/lance/optimize.py @@ -57,6 +57,14 @@ class CompactionOptions(TypedDict): The batch size to use when scanning input fragments. You may want to reduce this if you are running out of memory during compaction. + The default will use the same default from ``scanner``. + """ + io_buffer_size: Optional[int] + """ + The number of bytes to allow to queue up in the I/O buffer when scanning + input fragments. Increasing this can avoid a deadlock that occurs when a + single batch of data is larger than the I/O buffer size. + The default will use the same default from ``scanner``. """ compaction_mode: Optional[ diff --git a/python/src/dataset/optimize.rs b/python/src/dataset/optimize.rs index 321d7157b86..33aa32b94cd 100644 --- a/python/src/dataset/optimize.rs +++ b/python/src/dataset/optimize.rs @@ -58,6 +58,9 @@ fn parse_compaction_options( "batch_size" => { opts.batch_size = value.extract()?; } + "io_buffer_size" => { + opts.io_buffer_size = value.extract()?; + } "compaction_mode" => { let mode_str: Option = value.extract()?; if let Some(mode_str) = mode_str { diff --git a/rust/lance/src/dataset/optimize.rs b/rust/lance/src/dataset/optimize.rs index d591e42cc73..56cf74c1a62 100644 --- a/rust/lance/src/dataset/optimize.rs +++ b/rust/lance/src/dataset/optimize.rs @@ -191,6 +191,13 @@ pub struct CompactionOptions { /// specified then the default (see /// [`crate::dataset::Scanner::batch_size`]) will be used. pub batch_size: Option, + /// The number of bytes to allow to queue up in the I/O buffer when scanning + /// the input fragments. 
If not specified then the default (see + /// [`crate::dataset::Scanner::io_buffer_size`]) will be used. + /// + /// Increasing this can avoid a deadlock that occurs when a single batch of + /// data is larger than the I/O buffer size. + pub io_buffer_size: Option, /// Whether to defer remapping indices during compaction. If true, indices will /// not be remapped during this compaction operation. Instead, the fragment reuse index /// is updated and will be used to perform remapping later. @@ -237,6 +244,7 @@ impl Default for CompactionOptions { num_threads: None, max_bytes_per_file: None, batch_size: None, + io_buffer_size: None, defer_index_remap: false, compaction_mode: None, enable_binary_copy: false, @@ -264,6 +272,7 @@ impl CompactionOptions { /// - `lance.compaction.materialize_deletions_threshold` /// - `lance.compaction.defer_index_remap` /// - `lance.compaction.batch_size` + /// - `lance.compaction.io_buffer_size` /// - `lance.compaction.compaction_mode` /// - `lance.compaction.binary_copy_read_batch_bytes` /// - `lance.compaction.max_source_fragments` @@ -347,6 +356,14 @@ impl CompactionOptions { )) })?); } + "io_buffer_size" => { + self.io_buffer_size = Some(value.parse().map_err(|_| { + Error::invalid_input(format!( + "Invalid value for {}: '{}' (expected a non-negative integer)", + key, value + )) + })?); + } "compaction_mode" => { self.compaction_mode = Some(CompactionMode::try_from(value.as_str())?); } @@ -1194,6 +1211,8 @@ async fn transform_blob_v2_batch( /// and preserve insertion order. /// - `batch_size`: Optional batch size; if provided, set it on the scanner to control /// read batching. +/// - `io_buffer_size`: Optional I/O buffer size in bytes; if provided, set it on the +/// scanner to control how much data is queued during reads. /// - `with_frags`: Whether to scan only the specified old fragments and force /// in-order reading. 
/// - `capture_row_ids`: When index remapping is needed, include and capture the @@ -1209,6 +1228,7 @@ async fn prepare_reader( dataset: &Dataset, fragments: &[Fragment], batch_size: Option, + io_buffer_size: Option, with_frags: bool, capture_row_ids: bool, ) -> Result<( @@ -1234,6 +1254,9 @@ async fn prepare_reader( if let Some(bs) = batch_size { scanner.batch_size(bs); } + if let Some(io_buffer_size) = io_buffer_size { + scanner.io_buffer_size(io_buffer_size); + } if with_frags { scanner .with_fragments(fragments.to_vec()) @@ -1515,6 +1538,7 @@ async fn rewrite_files( dataset.as_ref(), &fragments, options.batch_size, + options.io_buffer_size, true, needs_remapping, ) @@ -2636,6 +2660,57 @@ mod tests { assert_eq!(scanned_data, data); } + #[rstest] + #[tokio::test] + async fn test_compact_with_io_buffer_size( + #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)] + data_storage_version: LanceFileVersion, + ) { + // Compaction should succeed and produce correct results when an + // explicit io_buffer_size is provided via CompactionOptions. + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + + let data = sample_data(); + + // Create a table with 2 small fragments so there is something to compact. + let reader = RecordBatchIterator::new(vec![Ok(data.clone())], data.schema()); + let write_params = WriteParams { + max_rows_per_file: 5_000, + max_rows_per_group: 1_000, + data_storage_version: Some(data_storage_version), + ..Default::default() + }; + let mut dataset = Dataset::write(reader, test_uri, Some(write_params)) + .await + .unwrap(); + assert_eq!(dataset.get_fragments().len(), 2); + + let options = CompactionOptions { + // A generous buffer so the read does not deadlock on large batches. 
+ io_buffer_size: Some(256 * 1024 * 1024), + ..Default::default() + }; + let plan = plan_compaction(&dataset, &options).await.unwrap(); + assert_eq!(plan.tasks().len(), 1); + + let metrics = compact_files(&mut dataset, options, None).await.unwrap(); + assert_eq!(metrics.fragments_removed, 2); + assert_eq!(metrics.fragments_added, 1); + + // All rows are preserved after compaction. + let scanner = dataset.scan(); + let batches = scanner + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + let scanned_data = concat_batches(&batches[0].schema(), &batches).unwrap(); + assert_eq!(scanned_data.num_rows(), data.num_rows()); + } + #[rstest] #[tokio::test] async fn test_compact_deletions( @@ -4683,6 +4758,10 @@ mod tests { "lance.compaction.batch_size".to_string(), "4096".to_string(), ), + ( + "lance.compaction.io_buffer_size".to_string(), + "1073741824".to_string(), + ), ( "lance.compaction.compaction_mode".to_string(), "try_binary_copy".to_string(), @@ -4701,6 +4780,7 @@ mod tests { assert!((opts.materialize_deletions_threshold - 0.25).abs() < f32::EPSILON); assert!(opts.defer_index_remap); assert_eq!(opts.batch_size, Some(4096)); + assert_eq!(opts.io_buffer_size, Some(1_073_741_824)); assert_eq!(opts.compaction_mode, Some(CompactionMode::TryBinaryCopy)); assert_eq!(opts.binary_copy_read_batch_bytes, Some(8_388_608)); } From 464f0bc2932dfc825c0b9cc5641e094414efe0db Mon Sep 17 00:00:00 2001 From: Yang Cen Date: Wed, 17 Jun 2026 00:15:16 +0800 Subject: [PATCH 110/177] perf(vector): vectorize RaBitQ top-k lower-bound pruning scan (#7243) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What The per-partition top-k scan of multi-bit IVF_RQ search (`accumulate_raw_query_multi_bit_topk_with_scratch`, Normal/Accurate modes) walked all `n` rows with a scalar lower-bound computation — per row: 4 bounds-checked loads, ~5 FLOPs, two compares, iterator and `Option` plumbing — even though ~99.9% of rows are 
pruned once the heap is tight. On dbpedia-openai-1M (1536d, num_bits=5, nprobes=24, k=10) this loop profiled at ~10–13% of query self time. This PR vectorizes the classification: - **Dense path** (`accumulate_topk_with_scratch`, rows `0..n`): new `bq::prune` kernels evaluate the lower bound and both pruning compares for 16 rows per call, returning bit masks. Mask-zero groups (the common case) are skipped whole; surviving lanes run the existing scalar rerank with live values. The `row_id` mapping is now only invoked for rows that reach the scalar tail, and no scratch buffer is needed — everything stays in registers. - **Sparse path** (prefiltered `accumulate_filtered_topk_with_scratch`): unchanged scalar loop. Kernels follow the `ex_dot` dispatch pattern: `#[target_feature]` AVX-512 and AVX2 implementations behind a `LazyLock` runtime-dispatched fn pointer, with a portable 16-wide fallback that LLVM auto-vectorizes (NEON is baseline on aarch64). `accumulate_distances_into_heap` (Fast-mode bypass) is left as is: it has no factor arrays and would need a separate kernel shape, and Fast mode doesn't take the gated path. ## Correctness The dense path is bit-identical to the scalar implementation, not just statistically equivalent: - The kernels keep the scalar operation order (multiplies and adds, **no FMA**), so the lower bounds match `raw_query_lower_bound` bit for bit, and comparisons use ordered-quiet GE (`_CMP_GE_OQ`) matching scalar `>=` (a NaN lower bound is never pruned). - The heap threshold snapshot taken at each 16-row group start can be stale, but the threshold only ever tightens, so the masks can only over-select survivors — and survivors are re-checked per row against live values. - Heap contents, processing order, and the `LANCE_RQ_PRUNE_STATS` counters are unchanged. 
## Tests - `vector::bq::prune`: every available kernel (portable, AVX2, AVX-512, dispatched) against a per-lane scalar reference on random inputs, exact `>=` boundary ties, and NaN/±inf semantics. - `test_raw_query_multi_bit_topk_dense_matches_sparse`: differential test of the dense path against the unchanged sparse scalar path with crafted factor columns controlling lower bounds and exact distances — n ∈ {1, 15, 16, 17, 100, 4109} × k ∈ {1, 10, n+7} × bounds, distance orderings descending (constant heap churn), ascending (mass pruning), random, duplicates, and exact ties, with a second pass on the shared heap (the carried tight-threshold regime). Asserts identical heap contents (row ids + distance bit patterns) and the k-smallest-distances reference. - `cargo test -p lance-index --lib vector::bq` and `cargo test -p lance ivf_rq` pass on aarch64 (portable kernel) and on x86_64 with AVX-512 (GCP c4-standard-16). ## Benchmark New `RQ heap topk` bench (binary FastScan + pruning scan + exact rerank; 4096 rows, k=10, DIM=1536, num_bits=5, error factors present so gating is enabled), GCP c4-standard-16 (AVX-512), pinned core: | mode | before | after | change | |---|---|---|---| | normal | 70.7 µs | 64.9 µs | **−9.4%** (p = 0.00) | | accurate | 93.4 µs | 87.9 µs | **−5.7%** (p = 0.00) | The binary FastScan portion of the bench is unchanged; the delta is the pruning scan itself, matching the ~5–9% end-to-end win predicted from the profile. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Fable 5 --- rust/lance-index/benches/rq.rs | 131 +++++- rust/lance-index/src/vector/bq.rs | 1 + rust/lance-index/src/vector/bq/prune.rs | 527 ++++++++++++++++++++++ rust/lance-index/src/vector/bq/storage.rs | 511 ++++++++++++++++++--- 4 files changed, 1104 insertions(+), 66 deletions(-) create mode 100644 rust/lance-index/src/vector/bq/prune.rs diff --git a/rust/lance-index/benches/rq.rs b/rust/lance-index/benches/rq.rs index e29ce9c4695..72e0c49820d 100644 --- a/rust/lance-index/benches/rq.rs +++ b/rust/lance-index/benches/rq.rs @@ -512,9 +512,138 @@ fn ex_bulk_paths(c: &mut Criterion) { } } +/// Top-k accumulation through the gated raw-query multi-bit path: binary +/// FastScan, the per-row lower-bound pruning scan, and the exact rerank of +/// the surviving rows. Error factors are present so the gating is enabled. +fn heap_topk(c: &mut Criterion) { + use arrow_array::{ArrayRef, FixedSizeListArray, Float32Array, UInt8Array, UInt64Array}; + use lance_arrow::FixedSizeListArrayExt; + use lance_index::vector::ApproxMode; + use lance_index::vector::bq::transform::{ + ERROR_FACTORS_COLUMN, EX_ADD_FACTORS_COLUMN, EX_SCALE_FACTORS_COLUMN, + }; + use lance_index::vector::storage::DistanceCalculatorOptions; + use std::collections::BinaryHeap; + use std::sync::Arc; + + const TOPK_DIM: usize = 1536; + const TOPK_ROWS: usize = 4096; + const TOPK_K: usize = 10; + const NUM_BITS: u8 = 5; + let ex_bits = NUM_BITS - 1; + + let mut rng = SmallRng::seed_from_u64(99); + let rq = RabitQuantizer::new_with_rotation::( + NUM_BITS, + TOPK_DIM as i32, + RQRotationType::Fast, + ); + let metadata = rq.metadata(None); + + let code_len = TOPK_DIM / 8; + let binary_codes = (0..TOPK_ROWS * code_len) + .map(|_| rng.random()) + .collect::>(); + let ex_code_len = blocked_ex_code_bytes(TOPK_DIM, ex_bits); + let ex_codes = (0..TOPK_ROWS * ex_code_len) + .map(|_| rng.random()) + .collect::>(); + // Factor 
magnitudes chosen so the lower bounds spread mostly with the add + // factors; once the heap is full the threshold prunes the vast majority + // of rows, like a production multi-partition scan. + let mut rand_factors = |low: f32, high: f32| { + Arc::new(Float32Array::from( + (0..TOPK_ROWS) + .map(|_| rng.random_range(low..high)) + .collect::>(), + )) as ArrayRef + }; + let batch = arrow_array::RecordBatch::try_from_iter(vec![ + ( + ROW_ID, + Arc::new(UInt64Array::from_iter_values(0..TOPK_ROWS as u64)) as ArrayRef, + ), + ( + RABIT_CODE_COLUMN, + Arc::new( + FixedSizeListArray::try_new_from_values( + UInt8Array::from(binary_codes), + code_len as i32, + ) + .unwrap(), + ) as ArrayRef, + ), + (ADD_FACTORS_COLUMN, rand_factors(0.0, 1.0)), + (SCALE_FACTORS_COLUMN, rand_factors(0.0005, 0.0015)), + (ERROR_FACTORS_COLUMN, rand_factors(0.0, 0.01)), + ( + RABIT_BLOCKED_EX_CODE_COLUMN, + Arc::new( + FixedSizeListArray::try_new_from_values( + UInt8Array::from(ex_codes), + ex_code_len as i32, + ) + .unwrap(), + ) as ArrayRef, + ), + (EX_ADD_FACTORS_COLUMN, rand_factors(0.0, 1.0)), + (EX_SCALE_FACTORS_COLUMN, rand_factors(0.00003, 0.0001)), + ]) + .unwrap(); + let storage = + RabitQuantizationStorage::try_from_batch(batch, &metadata, DistanceType::L2, None).unwrap(); + let query: ArrayRef = Arc::new(Float32Array::from( + (0..TOPK_DIM) + .map(|_| rng.random_range(-1.0f32..1.0)) + .collect::>(), + )); + + for (label, approx_mode) in [ + ("normal", ApproxMode::Normal), + ("accurate", ApproxMode::Accurate), + ] { + let mut f32_scratch = Vec::new(); + let calc = storage.dist_calculator_with_scratch( + query.clone(), + 1.0, + None, + &mut f32_scratch, + DistanceCalculatorOptions { approx_mode }, + ); + let mut heap = BinaryHeap::with_capacity(TOPK_K + 1); + let mut dists = Vec::new(); + let mut u16_scratch = Vec::new(); + let mut u8_scratch = Vec::new(); + let mut u32_scratch = Vec::new(); + c.bench_function( + format!( + "RQ heap topk ({label}): num_bits={NUM_BITS}, DIM={TOPK_DIM}, 
rows={TOPK_ROWS}, k={TOPK_K}" + ) + .as_str(), + |b| { + b.iter(|| { + heap.clear(); + calc.accumulate_topk_with_scratch( + TOPK_K, + None, + None, + |id| id as u64, + &mut heap, + &mut dists, + &mut u16_scratch, + &mut u8_scratch, + &mut u32_scratch, + ); + black_box(heap.len()) + }) + }, + ); + } +} + criterion_group!( name=benches; config = Criterion::default().measurement_time(Duration::from_secs(10)); - targets = construct_dist_table, compute_distances, ex_dot_kernels, ex_code_storage_load, ex_bulk_paths); + targets = construct_dist_table, compute_distances, ex_dot_kernels, ex_code_storage_load, ex_bulk_paths, heap_topk); criterion_main!(benches); diff --git a/rust/lance-index/src/vector/bq.rs b/rust/lance-index/src/vector/bq.rs index ad013683214..7a47fa88d54 100644 --- a/rust/lance-index/src/vector/bq.rs +++ b/rust/lance-index/src/vector/bq.rs @@ -20,6 +20,7 @@ use crate::vector::quantizer::QuantizerBuildParams; pub mod builder; pub(crate) mod dist_table_quant; pub mod ex_dot; +pub mod prune; pub mod rotation; pub mod storage; pub mod transform; diff --git a/rust/lance-index/src/vector/bq/prune.rs b/rust/lance-index/src/vector/bq/prune.rs new file mode 100644 index 00000000000..e67ab6642b8 --- /dev/null +++ b/rust/lance-index/src/vector/bq/prune.rs @@ -0,0 +1,527 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! SIMD kernels for the RaBitQ top-k lower-bound pruning scan. +//! +//! Multi-bit IVF_RQ search gates the exact ex-code rerank with a per-row +//! distance lower bound: after the binary FastScan fills the per-row binary +//! inner products, every row of the partition is classified against the query +//! upper bound and the current top-k heap threshold, and only the survivors +//! (typically well under 1%) are reranked. The classification is the per-row +//! formula of `RabitDistCalculator::raw_query_lower_bound`: +//! +//! ```text +//! lower_bound = (binary_ip - 0.5 * sum_q) * scale_factor +//! 
+ add_factor + query_factor +//! - error_factor * query_error +//! ``` +//! +//! These kernels evaluate the formula and both comparisons for +//! [`PRUNE_LANES`] rows at a time, returning bit masks instead of values so +//! the caller can skip whole groups (the overwhelmingly common case) and run +//! the existing scalar rerank only for the surviving lanes. +//! +//! Correctness contract: +//! +//! - The lower bound is computed with exactly the operation order of the +//! scalar helper — multiplies and adds, never FMA. A fused multiply-add +//! rounds differently, which could prune a row the scalar code would have +//! kept; with bit-identical lower bounds the masks reproduce the scalar +//! `>=` decisions exactly, keeping heap contents and prune-stats counters +//! unchanged. +//! - Comparisons use ordered-quiet GE predicates (`_CMP_GE_OQ`), matching +//! scalar `>=`: a NaN lower bound is never pruned and falls through to the +//! exact rerank. +//! - The heap threshold may be a stale snapshot (it only ever tightens); the +//! caller re-checks surviving lanes against live values, so a stale +//! threshold can only over-select survivors, never wrongly prune. + +use std::sync::LazyLock; + +/// Rows classified per kernel invocation. +pub const PRUNE_LANES: usize = 16; + +/// Per-query constants of the lower-bound formula, mirroring +/// `RabitDistCalculator::raw_query_lower_bound` term by term. +#[derive(Debug, Clone, Copy)] +pub struct LowerBoundTerms { + /// `0.5 * sum_q`, subtracted from the binary inner product. + pub half_sum_q: f32, + pub query_factor: f32, + pub query_error: f32, +} + +/// Classify [`PRUNE_LANES`] rows against the pruning bounds. +/// +/// Arguments are the per-row binary inner products, scale factors, add +/// factors, and error factors, followed by the formula constants, the query +/// upper bound, and the heap threshold (`None` while the heap is not full, +/// which disables the heap mask). 
+/// +/// Returns `(pruned_upper_bound, pruned_heap)` masks: bit `i` of +/// `pruned_upper_bound` is set when `lower_bound[i] >= upper_bound`, and bit +/// `i` of `pruned_heap` is set when the row is not already pruned by the +/// upper bound and `lower_bound[i] >= heap_threshold`. Surviving rows are the +/// zero bits of the OR of both masks. +pub type PruneMaskFn = fn( + &[f32; PRUNE_LANES], + &[f32; PRUNE_LANES], + &[f32; PRUNE_LANES], + &[f32; PRUNE_LANES], + LowerBoundTerms, + f32, + Option, +) -> (u16, u16); + +/// Resolve the prune-mask kernel for the running CPU once; the result can be +/// cached by the caller for per-partition use. +pub fn prune_mask_kernel() -> PruneMaskFn { + static KERNEL: LazyLock = LazyLock::new(select_prune_mask_kernel); + *KERNEL +} + +fn select_prune_mask_kernel() -> PruneMaskFn { + #[cfg(target_arch = "x86_64")] + { + if std::arch::is_x86_feature_detected!("avx512f") { + return x86::prune_masks_avx512_dispatch; + } + if std::arch::is_x86_feature_detected!("avx2") { + return x86::prune_masks_avx2_dispatch; + } + } + // On aarch64 the plain 16-wide loop auto-vectorizes to NEON (part of the + // baseline), so no dedicated kernel is needed. + prune_masks_portable +} + +/// Portable implementation; also the reference for the SIMD kernels. 
+fn prune_masks_portable( + dists: &[f32; PRUNE_LANES], + scale_factors: &[f32; PRUNE_LANES], + add_factors: &[f32; PRUNE_LANES], + error_factors: &[f32; PRUNE_LANES], + terms: LowerBoundTerms, + upper_bound: f32, + heap_threshold: Option, +) -> (u16, u16) { + let mut lower_bounds = [0.0f32; PRUNE_LANES]; + for lane in 0..PRUNE_LANES { + lower_bounds[lane] = ((dists[lane] - terms.half_sum_q) * scale_factors[lane] + + add_factors[lane] + + terms.query_factor) + - error_factors[lane] * terms.query_error; + } + let mut pruned_upper_bound = 0u16; + for (lane, lower_bound) in lower_bounds.iter().enumerate() { + pruned_upper_bound |= u16::from(*lower_bound >= upper_bound) << lane; + } + let mut pruned_heap = 0u16; + if let Some(threshold) = heap_threshold { + for (lane, lower_bound) in lower_bounds.iter().enumerate() { + pruned_heap |= u16::from(*lower_bound >= threshold) << lane; + } + pruned_heap &= !pruned_upper_bound; + } + (pruned_upper_bound, pruned_heap) +} + +#[cfg(target_arch = "x86_64")] +mod x86 { + use super::{LowerBoundTerms, PRUNE_LANES}; + use std::arch::x86_64::*; + + /// Lower bounds for 8 lanes with the scalar operation order (no FMA). 
+ #[inline] + #[target_feature(enable = "avx")] + fn lower_bounds_avx( + dists: __m256, + scale_factors: __m256, + add_factors: __m256, + error_factors: __m256, + half_sum_q: __m256, + query_factor: __m256, + query_error: __m256, + ) -> __m256 { + let binary_distance = _mm256_add_ps( + _mm256_add_ps( + _mm256_mul_ps(_mm256_sub_ps(dists, half_sum_q), scale_factors), + add_factors, + ), + query_factor, + ); + _mm256_sub_ps(binary_distance, _mm256_mul_ps(error_factors, query_error)) + } + + #[inline] + #[target_feature(enable = "avx")] + fn ge_mask_avx(lower_bounds_lo: __m256, lower_bounds_hi: __m256, bound: f32) -> u16 { + let bound = _mm256_set1_ps(bound); + let lo = _mm256_movemask_ps(_mm256_cmp_ps::<_CMP_GE_OQ>(lower_bounds_lo, bound)); + let hi = _mm256_movemask_ps(_mm256_cmp_ps::<_CMP_GE_OQ>(lower_bounds_hi, bound)); + (lo | (hi << 8)) as u16 + } + + #[target_feature(enable = "avx2")] + unsafe fn prune_masks_avx2( + dists: &[f32; PRUNE_LANES], + scale_factors: &[f32; PRUNE_LANES], + add_factors: &[f32; PRUNE_LANES], + error_factors: &[f32; PRUNE_LANES], + terms: LowerBoundTerms, + upper_bound: f32, + heap_threshold: Option, + ) -> (u16, u16) { + let half_sum_q = _mm256_set1_ps(terms.half_sum_q); + let query_factor = _mm256_set1_ps(terms.query_factor); + let query_error = _mm256_set1_ps(terms.query_error); + // SAFETY: the array references guarantee 16 readable floats each. 
+ let lower_bounds_lo = unsafe { + lower_bounds_avx( + _mm256_loadu_ps(dists.as_ptr()), + _mm256_loadu_ps(scale_factors.as_ptr()), + _mm256_loadu_ps(add_factors.as_ptr()), + _mm256_loadu_ps(error_factors.as_ptr()), + half_sum_q, + query_factor, + query_error, + ) + }; + let lower_bounds_hi = unsafe { + lower_bounds_avx( + _mm256_loadu_ps(dists.as_ptr().add(8)), + _mm256_loadu_ps(scale_factors.as_ptr().add(8)), + _mm256_loadu_ps(add_factors.as_ptr().add(8)), + _mm256_loadu_ps(error_factors.as_ptr().add(8)), + half_sum_q, + query_factor, + query_error, + ) + }; + let pruned_upper_bound = ge_mask_avx(lower_bounds_lo, lower_bounds_hi, upper_bound); + let pruned_heap = match heap_threshold { + Some(threshold) => { + ge_mask_avx(lower_bounds_lo, lower_bounds_hi, threshold) & !pruned_upper_bound + } + None => 0, + }; + (pruned_upper_bound, pruned_heap) + } + + pub(super) fn prune_masks_avx2_dispatch( + dists: &[f32; PRUNE_LANES], + scale_factors: &[f32; PRUNE_LANES], + add_factors: &[f32; PRUNE_LANES], + error_factors: &[f32; PRUNE_LANES], + terms: LowerBoundTerms, + upper_bound: f32, + heap_threshold: Option, + ) -> (u16, u16) { + // SAFETY: only selected when AVX2 was detected. + unsafe { + prune_masks_avx2( + dists, + scale_factors, + add_factors, + error_factors, + terms, + upper_bound, + heap_threshold, + ) + } + } + + #[target_feature(enable = "avx512f")] + unsafe fn prune_masks_avx512( + dists: &[f32; PRUNE_LANES], + scale_factors: &[f32; PRUNE_LANES], + add_factors: &[f32; PRUNE_LANES], + error_factors: &[f32; PRUNE_LANES], + terms: LowerBoundTerms, + upper_bound: f32, + heap_threshold: Option, + ) -> (u16, u16) { + // SAFETY: the array references guarantee 16 readable floats each. 
+ let (dists, scale_factors, add_factors, error_factors) = unsafe { + ( + _mm512_loadu_ps(dists.as_ptr()), + _mm512_loadu_ps(scale_factors.as_ptr()), + _mm512_loadu_ps(add_factors.as_ptr()), + _mm512_loadu_ps(error_factors.as_ptr()), + ) + }; + let binary_distance = _mm512_add_ps( + _mm512_add_ps( + _mm512_mul_ps( + _mm512_sub_ps(dists, _mm512_set1_ps(terms.half_sum_q)), + scale_factors, + ), + add_factors, + ), + _mm512_set1_ps(terms.query_factor), + ); + let lower_bounds = _mm512_sub_ps( + binary_distance, + _mm512_mul_ps(error_factors, _mm512_set1_ps(terms.query_error)), + ); + let pruned_upper_bound = + _mm512_cmp_ps_mask::<_CMP_GE_OQ>(lower_bounds, _mm512_set1_ps(upper_bound)); + let pruned_heap = match heap_threshold { + Some(threshold) => { + _mm512_cmp_ps_mask::<_CMP_GE_OQ>(lower_bounds, _mm512_set1_ps(threshold)) + & !pruned_upper_bound + } + None => 0, + }; + (pruned_upper_bound, pruned_heap) + } + + pub(super) fn prune_masks_avx512_dispatch( + dists: &[f32; PRUNE_LANES], + scale_factors: &[f32; PRUNE_LANES], + add_factors: &[f32; PRUNE_LANES], + error_factors: &[f32; PRUNE_LANES], + terms: LowerBoundTerms, + upper_bound: f32, + heap_threshold: Option, + ) -> (u16, u16) { + // SAFETY: only selected when AVX-512F was detected. + unsafe { + prune_masks_avx512( + dists, + scale_factors, + add_factors, + error_factors, + terms, + upper_bound, + heap_threshold, + ) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use rand::rngs::SmallRng; + use rand::{Rng, SeedableRng}; + + fn available_kernels() -> Vec<(&'static str, PruneMaskFn)> { + // `mut` is only exercised on x86_64 where extra kernels may be pushed. 
+ #[allow(unused_mut)] + let mut kernels = vec![ + ("portable", prune_masks_portable as PruneMaskFn), + ("dispatched", prune_mask_kernel()), + ]; + #[cfg(target_arch = "x86_64")] + { + if std::arch::is_x86_feature_detected!("avx2") { + kernels.push(("avx2", x86::prune_masks_avx2_dispatch)); + } + if std::arch::is_x86_feature_detected!("avx512f") { + kernels.push(("avx512", x86::prune_masks_avx512_dispatch)); + } + } + kernels + } + + /// Per-lane reference mirroring `raw_query_lower_bound` and the scalar + /// pruning checks of the top-k scan. + fn reference_masks( + dists: &[f32; PRUNE_LANES], + scale_factors: &[f32; PRUNE_LANES], + add_factors: &[f32; PRUNE_LANES], + error_factors: &[f32; PRUNE_LANES], + terms: LowerBoundTerms, + upper_bound: f32, + heap_threshold: Option, + ) -> (u16, u16) { + let mut pruned_upper_bound = 0u16; + let mut pruned_heap = 0u16; + for lane in 0..PRUNE_LANES { + let lower_bound = (dists[lane] - terms.half_sum_q) * scale_factors[lane] + + add_factors[lane] + + terms.query_factor + - error_factors[lane] * terms.query_error; + if lower_bound >= upper_bound { + pruned_upper_bound |= 1 << lane; + } else if heap_threshold.is_some_and(|threshold| lower_bound >= threshold) { + pruned_heap |= 1 << lane; + } + } + (pruned_upper_bound, pruned_heap) + } + + #[allow(clippy::too_many_arguments)] + fn assert_kernels_match_reference( + dists: &[f32; PRUNE_LANES], + scale_factors: &[f32; PRUNE_LANES], + add_factors: &[f32; PRUNE_LANES], + error_factors: &[f32; PRUNE_LANES], + terms: LowerBoundTerms, + upper_bound: f32, + heap_threshold: Option, + case: &str, + ) { + let expected = reference_masks( + dists, + scale_factors, + add_factors, + error_factors, + terms, + upper_bound, + heap_threshold, + ); + for (name, kernel) in available_kernels() { + let actual = kernel( + dists, + scale_factors, + add_factors, + error_factors, + terms, + upper_bound, + heap_threshold, + ); + assert_eq!( + actual, expected, + "kernel={name} case={case}: masks 
{actual:04x?} != {expected:04x?}" + ); + } + } + + #[test] + fn test_prune_masks_match_reference_on_random_inputs() { + let mut rng = SmallRng::seed_from_u64(42); + for round in 0..200 { + let mut dists = [0.0f32; PRUNE_LANES]; + let mut scale_factors = [0.0f32; PRUNE_LANES]; + let mut add_factors = [0.0f32; PRUNE_LANES]; + let mut error_factors = [0.0f32; PRUNE_LANES]; + for lane in 0..PRUNE_LANES { + dists[lane] = rng.random_range(-100.0f32..100.0); + scale_factors[lane] = rng.random_range(-2.0f32..2.0); + add_factors[lane] = rng.random_range(-10.0f32..10.0); + error_factors[lane] = rng.random_range(0.0f32..5.0); + } + let terms = LowerBoundTerms { + half_sum_q: rng.random_range(-50.0f32..50.0), + query_factor: rng.random_range(-10.0f32..10.0), + query_error: rng.random_range(0.0f32..2.0), + }; + let upper_bound = rng.random_range(-50.0f32..50.0); + let heap_threshold = if round % 3 == 0 { + None + } else { + Some(rng.random_range(-50.0f32..50.0)) + }; + assert_kernels_match_reference( + &dists, + &scale_factors, + &add_factors, + &error_factors, + terms, + upper_bound, + heap_threshold, + &format!("random round {round}"), + ); + } + } + + #[test] + fn test_prune_masks_exact_boundaries() { + // With scale=1, err=0, half_sum_q=0, query_factor=0 the lower bound + // is the input itself, so bounds can be placed exactly on lanes. + let dists: [f32; PRUNE_LANES] = std::array::from_fn(|lane| lane as f32); + let scale_factors = [1.0f32; PRUNE_LANES]; + let add_factors = [0.0f32; PRUNE_LANES]; + let error_factors = [0.0f32; PRUNE_LANES]; + let terms = LowerBoundTerms { + half_sum_q: 0.0, + query_factor: 0.0, + query_error: 1.0, + }; + // Equality must prune (scalar uses `>=`): lanes 3.. hit the upper + // bound, lanes 1..3 hit only the heap threshold. 
+ let (pruned_upper_bound, pruned_heap) = prune_masks_portable( + &dists, + &scale_factors, + &add_factors, + &error_factors, + terms, + 3.0, + Some(1.0), + ); + assert_eq!(pruned_upper_bound, 0xfff8); + assert_eq!(pruned_heap, 0x0006); + assert_kernels_match_reference( + &dists, + &scale_factors, + &add_factors, + &error_factors, + terms, + 3.0, + Some(1.0), + "exact boundaries", + ); + // No heap threshold: only the upper-bound mask is set. + assert_kernels_match_reference( + &dists, + &scale_factors, + &add_factors, + &error_factors, + terms, + 3.0, + None, + "no heap threshold", + ); + } + + #[test] + fn test_prune_masks_nan_and_infinity_semantics() { + let mut dists = [0.0f32; PRUNE_LANES]; + dists[0] = f32::NAN; + dists[1] = f32::INFINITY; + dists[2] = f32::NEG_INFINITY; + dists[3] = 1.0; + let mut scale_factors = [1.0f32; PRUNE_LANES]; + scale_factors[4] = f32::NAN; + let add_factors = [0.0f32; PRUNE_LANES]; + let mut error_factors = [0.0f32; PRUNE_LANES]; + error_factors[5] = f32::INFINITY; + let terms = LowerBoundTerms { + half_sum_q: 0.0, + query_factor: 0.0, + query_error: 1.0, + }; + for (upper_bound, heap_threshold) in [ + (0.5, Some(0.0)), + (f32::INFINITY, Some(f32::NEG_INFINITY)), + (f32::NAN, Some(f32::NAN)), + (0.5, None), + ] { + assert_kernels_match_reference( + &dists, + &scale_factors, + &add_factors, + &error_factors, + terms, + upper_bound, + heap_threshold, + &format!("special values ub={upper_bound} thr={heap_threshold:?}"), + ); + } + // NaN lower bounds (lane 0 via a NaN binary inner product, lane 4 via + // a NaN scale factor) must never be pruned by either mask. 
+ let (pruned_upper_bound, pruned_heap) = prune_masks_portable( + &dists, + &scale_factors, + &add_factors, + &error_factors, + terms, + 0.5, + Some(0.0), + ); + assert_eq!(pruned_upper_bound & 0b1_0001, 0); + assert_eq!(pruned_heap & 0b1_0001, 0); + } +} diff --git a/rust/lance-index/src/vector/bq/storage.rs b/rust/lance-index/src/vector/bq/storage.rs index ef88500a7b7..9c355d26960 100644 --- a/rust/lance-index/src/vector/bq/storage.rs +++ b/rust/lance-index/src/vector/bq/storage.rs @@ -17,7 +17,7 @@ use arrow_array::{ use arrow_schema::{DataType, Field, SchemaRef}; use async_trait::async_trait; use bytes::{Bytes, BytesMut}; -use itertools::Itertools; +use itertools::{Itertools, izip}; use lance_arrow::{ArrowFloatType, FixedSizeListArrayExt, FloatArray, RecordBatchExt}; use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, ROW_ID, Result}; @@ -48,6 +48,7 @@ use crate::vector::bq::ex_dot::{ EX_DOT_BLOCK_DIMS, ExDotFn, blocked_ex_code_bytes, ex_dot_kernel, pad_query_into, padded_query_len, repack_sequential_row, sequential_matches_blocked, }; +use crate::vector::bq::prune::{LowerBoundTerms, PRUNE_LANES, prune_mask_kernel}; use crate::vector::bq::rotation::{apply_fast_rotation, apply_fast_rotation_in_place}; use crate::vector::bq::transform::{ ADD_FACTORS_COLUMN, ERROR_FACTORS_COLUMN, EX_ADD_FACTORS_COLUMN, EX_SCALE_FACTORS_COLUMN, @@ -136,16 +137,28 @@ fn emit_rabit_prune_stats(message: &str) { ); } -fn record_rabit_prune_stats( +/// Per-scan tallies of the raw-query lower-bound gating, reported through +/// `record_rabit_prune_stats`. 
+#[derive(Default)] +struct RabitPruneCounters { candidates: usize, pruned_upper_bound: usize, pruned_heap: usize, exact: usize, exact_rejected: usize, -) { +} + +fn record_rabit_prune_stats(counters: &RabitPruneCounters) { if !rabit_prune_stats_enabled() { return; } + let RabitPruneCounters { + candidates, + pruned_upper_bound, + pruned_heap, + exact, + exact_rejected, + } = *counters; let stats = RABIT_PRUNE_STATS.get_or_init(RabitPruneStats::default); let calls = stats.calls.fetch_add(1, Ordering::Relaxed) + 1; @@ -798,6 +811,20 @@ struct RabitDistCalculatorParts<'a> { approx_mode: ApproxMode, } +/// Loop-invariant inputs of the raw-query multi-bit top-k scans: the row +/// count, the resolved ex-code state for exact reranking, and the query +/// bounds. +struct RawQueryTopkContext<'a> { + n: usize, + k: usize, + ex_bits: u8, + ex_codes: &'a [u8], + ex_add_factors: &'a [f32], + ex_scale_factors: &'a [f32], + query_lower_bound: f32, + query_upper_bound: f32, +} + /// Pick the query slice the ex-dot kernels consume: the rotated query itself /// when the dim is block-aligned, otherwise a zero-padded copy. fn kernel_query<'a>(rotated_query: &'a [f32], padded: &'a [f32]) -> &'a [f32] { @@ -1212,26 +1239,27 @@ impl<'a> RabitDistCalculator<'a> { full_dot * ex_scale_factors[id] + ex_add_factors[id] + self.query_factor } + /// Compute the binary inner products into `dists` and resolve the inputs + /// shared by the raw-query multi-bit top-k scans. Returns `None` when the + /// partition has no rows. 
#[allow(clippy::too_many_arguments)] - fn accumulate_raw_query_multi_bit_topk_with_scratch( + fn raw_query_multi_bit_topk_context( &self, k: usize, lower_bound: Option, upper_bound: Option, - row_ids: impl Iterator, - res: &mut BinaryHeap>, dists: &mut Vec, quantized_dists: &mut Vec, quantized_dists_table: &mut Vec, hacc_quantized_dists: &mut Vec, - ) { + ) -> Option> { let code_len = rabit_binary_code_bytes(self.dim); let n = self.codes.len() / code_len; if n == 0 { dists.clear(); quantized_dists.clear(); hacc_quantized_dists.clear(); - return; + return None; } self.binary_distances_with_scratch( @@ -1243,74 +1271,233 @@ impl<'a> RabitDistCalculator<'a> { hacc_quantized_dists, ); - let ex_bits = self.num_bits - 1; - let ex_codes = self - .ex_codes - .expect("raw-query multi-bit RQ requires ex codes"); - let ex_add_factors = self - .ex_add_factors - .expect("raw-query multi-bit RQ requires ex add factors"); - let ex_scale_factors = self - .ex_scale_factors - .expect("raw-query multi-bit RQ requires ex scale factors"); - let query_lower_bound = lower_bound.unwrap_or(f32::MIN); - let query_upper_bound = upper_bound.unwrap_or(f32::MAX); + Some(RawQueryTopkContext { + n, + k, + ex_bits: self.num_bits - 1, + ex_codes: self + .ex_codes + .expect("raw-query multi-bit RQ requires ex codes"), + ex_add_factors: self + .ex_add_factors + .expect("raw-query multi-bit RQ requires ex add factors"), + ex_scale_factors: self + .ex_scale_factors + .expect("raw-query multi-bit RQ requires ex scale factors"), + query_lower_bound: lower_bound.unwrap_or(f32::MIN), + query_upper_bound: upper_bound.unwrap_or(f32::MAX), + }) + } + + /// Process one candidate row given its lower bound: the bound checks, + /// the exact rerank, and the heap update shared by the sparse scan and + /// the dense scan's surviving lanes and tail. 
+ #[inline] + #[allow(clippy::too_many_arguments)] + fn accumulate_raw_query_multi_bit_row( + &self, + ctx: &RawQueryTopkContext<'_>, + id: usize, + row_id: u64, + binary_ip: f32, + raw_lower_bound: f32, + res: &mut BinaryHeap>, + max_dist: &mut Option, + counters: &mut RabitPruneCounters, + ) { + if raw_lower_bound >= ctx.query_upper_bound { + counters.pruned_upper_bound += 1; + return; + } + if res.len() >= ctx.k && max_dist.is_some_and(|max_dist| raw_lower_bound >= max_dist.0) { + counters.pruned_heap += 1; + return; + } + + counters.exact += 1; + let dist = self.raw_query_multi_bit_exact_distance( + id, + binary_ip, + ctx.ex_bits, + ctx.ex_codes, + ctx.ex_add_factors, + ctx.ex_scale_factors, + ); + if dist < ctx.query_lower_bound || dist >= ctx.query_upper_bound { + counters.exact_rejected += 1; + return; + } + let dist = OrderedFloat(dist); + if res.len() < ctx.k { + res.push(OrderedNode::new(row_id, dist)); + if res.len() == ctx.k { + *max_dist = res.peek().map(|node| node.dist); + } + } else if max_dist.is_some_and(|max_dist| max_dist > dist) { + res.pop(); + res.push(OrderedNode::new(row_id, dist)); + *max_dist = res.peek().map(|node| node.dist); + } + } + + #[allow(clippy::too_many_arguments)] + fn accumulate_raw_query_multi_bit_topk_with_scratch( + &self, + k: usize, + lower_bound: Option, + upper_bound: Option, + row_ids: impl Iterator, + res: &mut BinaryHeap>, + dists: &mut Vec, + quantized_dists: &mut Vec, + quantized_dists_table: &mut Vec, + hacc_quantized_dists: &mut Vec, + ) { + let Some(ctx) = self.raw_query_multi_bit_topk_context( + k, + lower_bound, + upper_bound, + dists, + quantized_dists, + quantized_dists_table, + hacc_quantized_dists, + ) else { + return; + }; let mut max_dist = res.peek().map(|node| node.dist); - let mut candidates = 0; - let mut pruned_upper_bound = 0; - let mut pruned_heap = 0; - let mut exact = 0; - let mut exact_rejected = 0; + let mut counters = RabitPruneCounters::default(); for (id, row_id) in row_ids { let 
Some(binary_ip) = dists.get(id).copied() else { continue; }; - candidates += 1; + counters.candidates += 1; let Some(raw_lower_bound) = self.raw_query_lower_bound(id, binary_ip) else { continue; }; - if raw_lower_bound >= query_upper_bound { - pruned_upper_bound += 1; - continue; - } - if res.len() >= k && max_dist.is_some_and(|max_dist| raw_lower_bound >= max_dist.0) { - pruned_heap += 1; - continue; + self.accumulate_raw_query_multi_bit_row( + &ctx, + id, + row_id, + binary_ip, + raw_lower_bound, + res, + &mut max_dist, + &mut counters, + ); + } + record_rabit_prune_stats(&counters); + } + + /// Top-k scan over all rows `0..n` in order: classify [`PRUNE_LANES`] + /// rows at a time with the SIMD lower-bound kernel and run the scalar + /// rerank only for the surviving lanes. + #[allow(clippy::too_many_arguments)] + fn accumulate_raw_query_multi_bit_topk_dense_with_scratch( + &self, + k: usize, + lower_bound: Option, + upper_bound: Option, + row_id: impl Fn(u32) -> u64, + res: &mut BinaryHeap>, + dists: &mut Vec, + quantized_dists: &mut Vec, + quantized_dists_table: &mut Vec, + hacc_quantized_dists: &mut Vec, + ) { + let Some(ctx) = self.raw_query_multi_bit_topk_context( + k, + lower_bound, + upper_bound, + dists, + quantized_dists, + quantized_dists_table, + hacc_quantized_dists, + ) else { + return; + }; + let dists = dists.as_slice(); + debug_assert_eq!(dists.len(), ctx.n); + let scale_factors = &self.scale_factors[..ctx.n]; + let add_factors = &self.add_factors[..ctx.n]; + let error_factors = &self + .error_factors + .expect("raw-query lower-bound gating requires error factors")[..ctx.n]; + // Same expression as `raw_query_lower_bound` with `error_factors` + // already resolved; the masks below match it bit for bit. 
+ let lower_bound_of = |id: usize, binary_ip: f32| { + self.raw_query_binary_distance(id, binary_ip) - error_factors[id] * self.query_error + }; + let terms = LowerBoundTerms { + half_sum_q: 0.5 * self.sum_q, + query_factor: self.query_factor, + query_error: self.query_error, + }; + let prune_masks = prune_mask_kernel(); + let mut max_dist = res.peek().map(|node| node.dist); + let mut counters = RabitPruneCounters::default(); + + let (dist_groups, dist_tail) = dists.as_chunks::(); + let (scale_groups, _) = scale_factors.as_chunks::(); + let (add_groups, _) = add_factors.as_chunks::(); + let (error_groups, _) = error_factors.as_chunks::(); + for (group, (dist16, scale16, add16, error16)) in + izip!(dist_groups, scale_groups, add_groups, error_groups).enumerate() + { + counters.candidates += PRUNE_LANES; + // The heap threshold only ever tightens, so this group-start + // snapshot can only over-select survivors (which the per-row + // processing below re-checks against live values), never prune a + // row the scalar scan would have kept. 
+ let heap_threshold = (res.len() >= ctx.k) + .then(|| max_dist.map(|max_dist| max_dist.0)) + .flatten(); + let (pruned_upper_bound, pruned_heap) = prune_masks( + dist16, + scale16, + add16, + error16, + terms, + ctx.query_upper_bound, + heap_threshold, + ); + counters.pruned_upper_bound += pruned_upper_bound.count_ones() as usize; + counters.pruned_heap += pruned_heap.count_ones() as usize; + let mut survivors = !(pruned_upper_bound | pruned_heap); + while survivors != 0 { + let lane = survivors.trailing_zeros() as usize; + survivors &= survivors - 1; + let id = group * PRUNE_LANES + lane; + let binary_ip = dists[id]; + self.accumulate_raw_query_multi_bit_row( + &ctx, + id, + row_id(id as u32), + binary_ip, + lower_bound_of(id, binary_ip), + res, + &mut max_dist, + &mut counters, + ); } + } - exact += 1; - let dist = self.raw_query_multi_bit_exact_distance( + let tail_start = ctx.n - dist_tail.len(); + for (offset, binary_ip) in dist_tail.iter().copied().enumerate() { + let id = tail_start + offset; + counters.candidates += 1; + self.accumulate_raw_query_multi_bit_row( + &ctx, id, + row_id(id as u32), binary_ip, - ex_bits, - ex_codes, - ex_add_factors, - ex_scale_factors, + lower_bound_of(id, binary_ip), + res, + &mut max_dist, + &mut counters, ); - if dist < query_lower_bound || dist >= query_upper_bound { - exact_rejected += 1; - continue; - } - let dist = OrderedFloat(dist); - if res.len() < k { - res.push(OrderedNode::new(row_id, dist)); - if res.len() == k { - max_dist = res.peek().map(|node| node.dist); - } - } else if max_dist.is_some_and(|max_dist| max_dist > dist) { - res.pop(); - res.push(OrderedNode::new(row_id, dist)); - max_dist = res.peek().map(|node| node.dist); - } } - record_rabit_prune_stats( - candidates, - pruned_upper_bound, - pruned_heap, - exact, - exact_rejected, - ); + record_rabit_prune_stats(&counters); } fn raw_query_lower_bound_gating_disabled_reason(&self) -> Option<&'static str> { @@ -1717,13 +1904,11 @@ impl DistCalculator for 
RabitDistCalculator<'_> { return; } - let code_len = rabit_binary_code_bytes(self.dim); - let n = self.codes.len() / code_len; - self.accumulate_raw_query_multi_bit_topk_with_scratch( + self.accumulate_raw_query_multi_bit_topk_dense_with_scratch( k, lower_bound, upper_bound, - (0..n).map(|id| (id, row_id(id as u32))), + row_id, res, dists, quantized_dists, @@ -2526,6 +2711,8 @@ mod tests { use arrow_array::{ArrayRef, Float32Array, Float64Array, UInt64Array}; use lance_core::ROW_ID; use lance_linalg::distance::DistanceType; + use rand::rngs::SmallRng; + use rand::{Rng, SeedableRng}; use rstest::rstest; use crate::vector::bq::{RQRotationType, builder::RabitQuantizer}; @@ -3741,6 +3928,200 @@ mod tests { } } + /// Inputs crafted so the top-k scan outcomes are fully determined by the + /// factor columns: with zero scale factors, a zero query factor, and a + /// query error of one, the lower bound is + /// `add_factors[id] - error_factors[id]`, and with zero ex scale factors + /// the exact distance is `ex_add_factors[id]`, regardless of the random + /// codes and query. 
+ struct CraftedTopkData { + codes: Vec, + ex_codes: Vec, + dist_table: Vec, + ex_query: Vec, + scale_factors: Vec, + add_factors: Vec, + error_factors: Vec, + ex_scale_factors: Vec, + ex_add_factors: Vec, + } + + const CRAFTED_TOPK_DIM: usize = 64; + const CRAFTED_TOPK_NUM_BITS: u8 = 5; + + impl CraftedTopkData { + fn new( + exact_dists: &[f32], + lower_bound_margins: &[f32], + error_factors: Vec, + rng: &mut SmallRng, + ) -> Self { + let n = exact_dists.len(); + let code_len = rabit_binary_code_bytes(CRAFTED_TOPK_DIM); + let ex_code_len = blocked_ex_code_bytes(CRAFTED_TOPK_DIM, CRAFTED_TOPK_NUM_BITS - 1); + let add_factors = izip!(exact_dists, lower_bound_margins, &error_factors) + .map(|(dist, margin, error)| dist - margin + error) + .collect(); + Self { + codes: (0..n * code_len).map(|_| rng.random()).collect(), + ex_codes: (0..n * ex_code_len).map(|_| rng.random()).collect(), + dist_table: (0..CRAFTED_TOPK_DIM * 4) + .map(|_| rng.random_range(-1.0f32..1.0)) + .collect(), + ex_query: (0..CRAFTED_TOPK_DIM) + .map(|_| rng.random_range(-1.0f32..1.0)) + .collect(), + scale_factors: vec![0.0; n], + add_factors, + error_factors, + ex_scale_factors: vec![0.0; n], + ex_add_factors: exact_dists.to_vec(), + } + } + + fn calculator(&self, approx_mode: ApproxMode) -> RabitDistCalculator<'_> { + RabitDistCalculator::new( + CRAFTED_TOPK_DIM, + CRAFTED_TOPK_NUM_BITS, + RabitQueryEstimator::RawQuery, + Cow::Borrowed(self.dist_table.as_slice()), + Cow::Borrowed(self.ex_query.as_slice()), + 0.7, + &self.codes, + Some(&self.ex_codes), + blocked_ex_code_bytes(CRAFTED_TOPK_DIM, CRAFTED_TOPK_NUM_BITS - 1), + &self.add_factors, + &self.scale_factors, + Some(&self.error_factors), + Some(&self.ex_add_factors), + Some(&self.ex_scale_factors), + None, + 0.0, + 1.0, + approx_mode, + ) + } + } + + fn canonical_heap_rows(heap: BinaryHeap>) -> Vec<(u32, u64)> { + let mut rows = heap + .into_iter() + .map(|node| (node.dist.0.to_bits(), node.id)) + .collect::>(); + rows.sort_unstable(); + rows 
+ } + + /// The dense (SIMD-pruned) scan must reproduce the sparse scalar scan + /// exactly: identical heap contents including row ids, and the k smallest + /// in-bounds exact distances overall. + #[rstest] + fn test_raw_query_multi_bit_topk_dense_matches_sparse( + #[values(ApproxMode::Normal, ApproxMode::Accurate)] approx_mode: ApproxMode, + #[values("descending", "ascending", "random", "duplicates", "duplicate_ties")] + ordering: &str, + ) { + for n in [1usize, 15, 16, 17, 100, 4109] { + let mut rng = SmallRng::seed_from_u64(n as u64 * 31 + ordering.len() as u64); + let exact_dists: Vec = match ordering { + // Improving rows force constant heap updates. + "descending" => (0..n).map(|id| (n - id) as f32).collect(), + // Worsening rows force mass pruning, the common regime. + "ascending" => (0..n).map(|id| id as f32).collect(), + "random" => (0..n).map(|_| rng.random_range(0.0..n as f32)).collect(), + "duplicates" => (0..n).map(|id| (id % 7) as f32).collect(), + // Lower bound equals the distance, so heap-threshold and + // upper-bound comparisons hit exact `>=` ties. 
+ "duplicate_ties" => (0..n).map(|id| (id % 5) as f32).collect(), + _ => unreachable!(), + }; + let (margins, error_factors) = if ordering == "duplicate_ties" { + (vec![0.0; n], vec![0.0; n]) + } else if ordering == "random" { + ( + (0..n).map(|_| rng.random_range(0.0f32..2.0)).collect(), + (0..n).map(|_| rng.random_range(0.0f32..1.0)).collect(), + ) + } else { + ( + vec![1.0; n], + (0..n).map(|_| rng.random_range(0.0f32..1.0)).collect(), + ) + }; + let data = CraftedTopkData::new(&exact_dists, &margins, error_factors, &mut rng); + let calc = data.calculator(approx_mode); + assert!( + calc.raw_query_lower_bound_gating_disabled_reason() + .is_none() + ); + + let max_dist = exact_dists.iter().fold(0.0f32, |acc, dist| acc.max(*dist)); + for k in [1usize, 10, n + 7] { + for bounds in [(None, None), (Some(max_dist * 0.25), Some(max_dist * 0.7))] { + let (lower_bound, upper_bound) = bounds; + let mut dense_heap = BinaryHeap::new(); + let mut sparse_heap = BinaryHeap::new(); + let mut dists = Vec::new(); + let mut u16_scratch = Vec::new(); + let mut u8_scratch = Vec::new(); + let mut u32_scratch = Vec::new(); + // Two passes sharing the heap, as IVF partition probing + // does: the second pass starts with a full, tight heap. 
+ for pass in 0..2u64 { + let offset = pass * n as u64; + calc.accumulate_topk_with_scratch( + k, + lower_bound, + upper_bound, + |id| id as u64 + offset, + &mut dense_heap, + &mut dists, + &mut u16_scratch, + &mut u8_scratch, + &mut u32_scratch, + ); + calc.accumulate_filtered_topk_with_scratch( + k, + lower_bound, + upper_bound, + (0..n as u32).map(|id| (id, id as u64 + offset)), + |_| true, + &mut sparse_heap, + &mut dists, + &mut u16_scratch, + &mut u8_scratch, + &mut u32_scratch, + ); + } + let dense = canonical_heap_rows(dense_heap); + let sparse = canonical_heap_rows(sparse_heap); + assert_eq!( + dense, sparse, + "ordering={ordering} n={n} k={k} bounds={bounds:?} mode={approx_mode:?}" + ); + + // The distance multiset must be the k smallest in-bounds + // distances over both passes. Row ids are not compared: + // evictions among tied maxima depend on heap layout. + let query_lower_bound = lower_bound.unwrap_or(f32::MIN); + let query_upper_bound = upper_bound.unwrap_or(f32::MAX); + let mut expected = (0..2 * n) + .map(|row| exact_dists[row % n]) + .filter(|dist| *dist >= query_lower_bound && *dist < query_upper_bound) + .map(|dist| dist.to_bits()) + .collect::>(); + expected.sort_unstable(); + expected.truncate(k); + let actual = dense.iter().map(|(dist, _)| *dist).collect::>(); + assert_eq!( + actual, expected, + "ordering={ordering} n={n} k={k} bounds={bounds:?} mode={approx_mode:?}" + ); + } + } + } + } + #[test] fn test_raw_query_one_bit_distance_uses_binary_factors_without_ex_columns() { let code_dim = 8usize; From 72ea21ca537562906d796e8e02cfefa51acca8c4 Mon Sep 17 00:00:00 2001 From: Vova Kolmakov Date: Wed, 17 Jun 2026 00:58:20 +0700 Subject: [PATCH 111/177] feat(index): accelerate regex and infix LIKE with the ngram index (#7139) ## What Extends the ngram index (today only `contains(col, 'substr')`) to also accelerate `regexp_like(col, pat)` / `regexp_match(col, pat)` and infix `LIKE` (`col LIKE '%foo%bar%'`), which until now fell through to a full 
table scan. Closes #7130. ## How Following Postgres `pg_trgm` and Russ Cox's [trigram-index approach](https://swtch.com/~rsc/regexp/regexp4.html), a pattern is compiled into a boolean trigram condition (`TrigramQuery`, an AND/OR tree) that is a *necessary* condition for any match. This maps onto the inverted index's set algebra: AND is posting-list intersection, OR is union. The index returns a candidate superset and the scan rechecks the true predicate, exactly as `contains` does. The derivation walks the `regex-syntax` HIR bottom-up, tracking per-node `(emptyable, exact, prefix, suffix)` sets and folding boundary trigrams across concatenation, with bounds that fold-then-discard so precision loss never drops a necessary trigram. ### Soundness The derived condition never requires a trigram a matching string could lack, so no real match is dropped. Requirements come from the index's own tokenizer (so sub-trigram runs contribute nothing); character classes and case-insensitive folds are treated as a single unknown character (the index's normalization disagrees with Unicode case folding - `(?i)c` also matches U+2102); and patterns with no derivable trigram (`a.b`, `.*`) are left to a full scan. ### Why infix LIKE A plain-literal `regexp_like(col, 'foo')` is rewritten to `col LIKE '%foo%'` by the optimizer before it reaches the index, so without infix-LIKE the most common "regex" query would not accelerate. The LIKE is translated to a loose regex for candidate generation only; the original LIKE stays as the recheck filter, so the candidate regex need only be a sound superset. 
## Benchmark `cargo bench -p lance --bench regex_ngram`, 200k rows, before (main) vs after: | Query | Before | After | Change | |---|---|---|---| | `regexp_match(doc, 'zqxwvu.*needlexyz')` | 45.8 ms | 8.2 ms | -82% | | `regexp_match(doc, '(zqxwvu\|qwerasdf\|needlexyz)')` | 51.7 ms | 11.6 ms | -77% | | `regexp_match(doc, 'zqxwvu')` (rewritten to LIKE) | 36.6 ms | 8.0 ms | -78% | | `regexp_match(doc, 'a.b')` (non-accelerable) | 77.8 ms | 76.2 ms | within noise | ## Testing Unit tests for the regex-to-trigram derivation and the regex-flags folding; index-level search tests (AND across `.*`, alternation union, NULL exclusion, absent trigram); a multi-fragment end-to-end scan test asserting correct results and index engagement for `regexp_like`, `regexp_match`, and `LIKE`, plus a case-insensitive query that must fall back to a full recheck; and a regression test ensuring non-accelerable patterns still return all correct matches via full recheck. No binding changes are needed - Python/Java pass these filter strings through the existing scan API, so acceleration applies transparently. 
Co-authored-by: Vova Kolmakov Co-authored-by: Claude Opus 4.8 (1M context) --- Cargo.lock | 1 + Cargo.toml | 1 + docs/src/format/index/scalar/ngram.md | 8 +- python/Cargo.lock | 1 + rust/lance-index/Cargo.toml | 1 + rust/lance-index/src/scalar.rs | 24 +- rust/lance-index/src/scalar/expression.rs | 237 +++++- rust/lance-index/src/scalar/fmindex.rs | 9 + rust/lance-index/src/scalar/ngram.rs | 151 +++- .../src/scalar/ngram/ngram_regex.rs | 673 ++++++++++++++++++ rust/lance/Cargo.toml | 4 + rust/lance/benches/regex_ngram.rs | 134 ++++ rust/lance/src/dataset/scanner.rs | 192 +++++ 13 files changed, 1411 insertions(+), 25 deletions(-) create mode 100644 rust/lance-index/src/scalar/ngram/ngram_regex.rs create mode 100644 rust/lance/benches/regex_ngram.rs diff --git a/Cargo.lock b/Cargo.lock index 1e820b0c63d..5bee7197f24 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4835,6 +4835,7 @@ dependencies = [ "rand_distr", "rangemap", "rayon", + "regex-syntax", "roaring", "rstest", "serde", diff --git a/Cargo.toml b/Cargo.toml index 68a9002872b..a95f530de38 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -180,6 +180,7 @@ rand_distr = { version = "0.5.1" } rand_xoshiro = "0.7.0" rangemap = { version = "1.0" } rayon = "1.10" +regex-syntax = "0.8.10" roaring = "0.11.4" rstest = "0.26.1" serde = { version = "^1" } diff --git a/docs/src/format/index/scalar/ngram.md b/docs/src/format/index/scalar/ngram.md index bdf78474d50..d437363d264 100644 --- a/docs/src/format/index/scalar/ngram.md +++ b/docs/src/format/index/scalar/ngram.md @@ -29,4 +29,10 @@ The N-gram index provides inexact results for the following query types: | Query Type | Description | Operation | Result Type | |----------------|--------------------------|-------------------------------------------------------|-------------| -| **contains** | Substring search in text | Finds all trigrams in query, intersects posting lists | AtMost | \ No newline at end of file +| **contains** | Substring search in text | Finds all trigrams in 
query, intersects posting lists | AtMost | +| **regexp_like** / **regexp_match** | Regular-expression match | Derives a necessary trigram condition from the pattern (AND of intersections, OR of unions), then rechecks the true regex | AtMost | +| **LIKE** (infix) | Wildcard match such as `%foo%bar%` | Uses the literal segments of the pattern as a trigram condition, then rechecks the LIKE | AtMost | + +Patterns from which no trigram can be derived - for example `a.b`, `.*`, +case-insensitive matches, or literal runs shorter than three characters - fall +back to rechecking every row. This is always correct, just not accelerated. diff --git a/python/Cargo.lock b/python/Cargo.lock index 5dfc1ba47e0..37035be3085 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -4431,6 +4431,7 @@ dependencies = [ "rand_distr", "rangemap", "rayon", + "regex-syntax", "roaring", "serde", "serde_json", diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml index e3947b57856..85de43c0f9b 100644 --- a/rust/lance-index/Cargo.toml +++ b/rust/lance-index/Cargo.toml @@ -56,6 +56,7 @@ object_store.workspace = true prost.workspace = true prost-types.workspace = true rand.workspace = true +regex-syntax.workspace = true roaring.workspace = true rayon.workspace = true serde_json.workspace = true diff --git a/rust/lance-index/src/scalar.rs b/rust/lance-index/src/scalar.rs index 4830586f85c..a287d277a81 100644 --- a/rust/lance-index/src/scalar.rs +++ b/rust/lance-index/src/scalar.rs @@ -8,6 +8,7 @@ use arrow_array::{BooleanArray, ListArray, RecordBatch, UInt64Array}; use arrow_schema::{Field, Schema}; use async_trait::async_trait; use bytes::Bytes; +use datafusion::functions::regex::regexplike::RegexpLikeFunc; use datafusion::functions::string::contains::ContainsFunc; use datafusion::functions_nested::array_has; use datafusion::physical_plan::SendableRecordBatchStream; @@ -649,9 +650,15 @@ impl AnyQuery for LabelListQuery { pub enum TextQuery { /// Retrieve all row ids where the 
text contains the given string StringContains(String), - // TODO: In the future we should be able to do string-insensitive contains - // as well as partial matches (e.g. LIKE 'foo%') and potentially even - // some regular expressions + /// Retrieve all row ids whose text matches the given regular expression. + /// + /// The pattern is a full regular expression (as accepted by `regexp_like`). + /// The index returns a candidate superset that the scan rechecks, so any + /// pattern is sound; patterns with no usable trigram structure simply fall + /// back to rechecking every row. + Regex(String), + // TODO: In the future we should be able to do case-insensitive contains + // as well as partial matches (e.g. LIKE 'foo%'). } impl AnyQuery for TextQuery { @@ -672,6 +679,17 @@ impl AnyQuery for TextQuery { Expr::Literal(ScalarValue::Utf8(Some(substr.clone())), None), ], }), + // `regexp_like` returns Boolean directly, so the reconstructed + // expression can be used as-is for the recheck filter (no IsNotNull + // wrapper, unlike `regexp_match`). It is the semantic equivalent of + // the original predicate for the "does it match" question. + Self::Regex(pattern) => Expr::ScalarFunction(ScalarFunction { + func: Arc::new(RegexpLikeFunc::new().into()), + args: vec![ + Expr::Column(Column::new_unqualified(col)), + Expr::Literal(ScalarValue::Utf8(Some(pattern.clone())), None), + ], + }), } } diff --git a/rust/lance-index/src/scalar/expression.rs b/rust/lance-index/src/scalar/expression.rs index 38a29e9c43c..ea7fbabc813 100644 --- a/rust/lance-index/src/scalar/expression.rs +++ b/rust/lance-index/src/scalar/expression.rs @@ -781,20 +781,28 @@ impl ScalarQueryParser for LabelListQueryParser { } } -/// A parser for indices that handle string contains queries +/// A parser for indices that handle string `contains` queries, and -- when +/// `supports_regex` is set -- `regexp_like` / `regexp_match` queries. 
#[derive(Debug, Clone)] pub struct TextQueryParser { index_name: String, index_type: String, needs_recheck: bool, + supports_regex: bool, } impl TextQueryParser { - pub fn new(index_name: String, index_type: String, needs_recheck: bool) -> Self { + pub fn new( + index_name: String, + index_type: String, + needs_recheck: bool, + supports_regex: bool, + ) -> Self { Self { index_name, index_type, needs_recheck, + supports_regex, } } } @@ -837,31 +845,156 @@ impl ScalarQueryParser for TextQueryParser { func: &ScalarUDF, args: &[Expr], ) -> Option { - if args.len() != 2 { + // The first argument is the indexed column; the second is the substring + // / pattern. `contains` takes exactly two arguments; the regex functions + // optionally take a third flags argument. + if args.len() < 2 { return None; } - let scalar = maybe_scalar(&args[1], data_type)?; - match scalar { - ScalarValue::Utf8(Some(scalar_str)) | ScalarValue::LargeUtf8(Some(scalar_str)) => { - if func.name() == "contains" { - let query = TextQuery::StringContains(scalar_str); - Some(IndexedExpression::index_query_with_recheck( - column.to_string(), - self.index_name.clone(), - self.index_type.clone(), - Arc::new(query), - self.needs_recheck, - )) - } else { + // A non-string pattern cannot be handled. + let (ScalarValue::Utf8(Some(pattern)) | ScalarValue::LargeUtf8(Some(pattern))) = + maybe_scalar(&args[1], data_type)? + else { + return None; + }; + + let query = match func.name() { + "contains" if args.len() == 2 => TextQuery::StringContains(pattern), + "regexp_like" | "regexp_match" if self.supports_regex => { + let pattern = match args.get(2) { + Some(flags_expr) => apply_regex_flags(&pattern, flags_expr)?, + None => pattern, + }; + // If the pattern yields no usable trigram (e.g. `a.b`), leave it + // to a full scan instead of routing it to the index, which could + // only answer with an unsupported "recheck everything" result. 
+ if !crate::scalar::ngram::regex_can_use_index(&pattern) { + return None; + } + TextQuery::Regex(pattern) + } + _ => return None, + }; + + Some(IndexedExpression::index_query_with_recheck( + column.to_string(), + self.index_name.clone(), + self.index_type.clone(), + Arc::new(query), + self.needs_recheck, + )) + } + + fn visit_like( + &self, + column: &str, + like: &Like, + pattern: &ScalarValue, + ) -> Option { + // Infix LIKE is accelerated only by the ngram index (via its regex + // machinery). A plain-literal `regexp_like(col, 'foo')` is rewritten to + // `col LIKE '%foo%'` before it reaches the index, so this is the path + // that accelerates those. ILIKE is skipped because its case folding does + // not match the index's normalization. + if !self.supports_regex || like.case_insensitive { + return None; + } + let pattern_str = match pattern { + ScalarValue::Utf8(Some(s)) | ScalarValue::LargeUtf8(Some(s)) => s.as_str(), + _ => return None, + }; + // Translate the LIKE pattern into a loose regex used only for candidate + // generation; the original LIKE stays as the recheck filter, so the + // regex only needs to be a sound superset. + let regex = like_to_regex(pattern_str, like.escape_char)?; + if !crate::scalar::ngram::regex_can_use_index(&regex) { + return None; + } + Some(IndexedExpression { + scalar_query: Some(ScalarIndexExpr::Query(ScalarIndexSearch { + column: column.to_string(), + index_name: self.index_name.clone(), + index_type: self.index_type.clone(), + query: Arc::new(TextQuery::Regex(regex)), + needs_recheck: self.needs_recheck, + fragment_bitmap: None, + })), + refine_expr: Some(Expr::Like(like.clone())), + }) + } +} + +/// Translate a LIKE pattern into a regular expression used purely for ngram +/// candidate generation: `%` becomes `.*`, `_` becomes `.`, and literal +/// characters are regex-escaped. Returns `None` when no literal run is long +/// enough to yield a trigram (the index could not help, so a full scan is left +/// to handle it).
+fn like_to_regex(pattern: &str, escape: Option) -> Option { + let mut regex = String::new(); + let mut run = 0usize; + let mut longest_run = 0usize; + let mut chars = pattern.chars(); + while let Some(c) = chars.next() { + let literal = if Some(c) == escape { + // The next character is escaped, i.e. a literal. + chars.next() + } else { + match c { + '%' => { + regex.push_str(".*"); + run = 0; None } + '_' => { + regex.push('.'); + run = 0; + None + } + other => Some(other), } - _ => { - // If the scalar is not a string, we cannot handle it - None + }; + if let Some(lit) = literal { + if regex_syntax::is_meta_character(lit) { + regex.push('\\'); + } + regex.push(lit); + // Only runs of alphanumeric characters can produce a trigram. + if lit.is_alphanumeric() { + run += 1; + longest_run = longest_run.max(run); + } else { + run = 0; } } } + (longest_run >= 3).then_some(regex) +} + +/// Fold the supported `regexp_like` / `regexp_match` flags into an inline prefix +/// on the pattern (e.g. flags `"i"` -> `"(?i)pattern"`). Returns `None` for a +/// non-literal flags argument or an unrecognized flag, so the caller leaves the +/// predicate to a full recheck rather than risk changing its semantics. +fn apply_regex_flags(pattern: &str, flags_expr: &Expr) -> Option { + let (Expr::Literal(ScalarValue::Utf8(Some(flags)), _) + | Expr::Literal(ScalarValue::LargeUtf8(Some(flags)), _)) = flags_expr + else { + return None; + }; + let mut inline = String::new(); + for flag in flags.chars() { + // Only flags expressible as an inline `(?...)` group in the regex crate + // (which the recheck uses) are safe to fold. 
+ if ['i', 's', 'm', 'x'].contains(&flag) { + inline.push(flag); + } else { + return None; + } + } + if inline.is_empty() { + Some(pattern.to_string()) + } else { + Some(format!("(?{inline}){pattern}")) + } } /// A parser for indices that handle queries with the contains_tokens function @@ -1813,7 +1946,18 @@ fn visit_node( Expr::IsFalse(expr) => Ok(visit_is_bool(expr.as_ref(), index_info, false)), Expr::IsTrue(expr) => Ok(visit_is_bool(expr.as_ref(), index_info, true)), Expr::IsNull(expr) => Ok(visit_is_null(expr.as_ref(), index_info, false)), - Expr::IsNotNull(expr) => Ok(visit_is_null(expr.as_ref(), index_info, true)), + Expr::IsNotNull(expr) => { + // `regexp_match(col, pat)` returns a list and is coerced to + // `IsNotNull(regexp_match(...))` before it reaches here. Unwrap that + // so the regex acceleration applies; everything else is a genuine + // IS NOT NULL check. + if let Expr::ScalarFunction(scalar_fn) = expr.as_ref() + && scalar_fn.func.name() == "regexp_match" + { + return Ok(visit_scalar_fn(scalar_fn, index_info)); + } + Ok(visit_is_null(expr.as_ref(), index_info, true)) + } Expr::Not(expr) => visit_not(expr.as_ref(), index_info, depth), Expr::BinaryExpr(binary_expr) => visit_binary_expr(binary_expr, index_info, depth), Expr::ScalarFunction(scalar_fn) => Ok(visit_scalar_fn(scalar_fn, index_info)), @@ -2690,6 +2834,59 @@ mod tests { assert!(matches!(negated.upper, NullableRowAddrMask::BlockList(_))); } + #[test] + fn test_like_to_regex() { + // `%` -> `.*`, `_` -> `.`, with a literal run of at least three chars. + assert_eq!(like_to_regex("%foo%", None).as_deref(), Some(".*foo.*")); + assert_eq!(like_to_regex("foo%bar", None).as_deref(), Some("foo.*bar")); + assert_eq!(like_to_regex("foo_bar", None).as_deref(), Some("foo.bar")); + assert_eq!(like_to_regex("foobar", None).as_deref(), Some("foobar")); + + // Regex metacharacters in the literal portion are escaped. 
+ assert_eq!( + like_to_regex("%a.bcd%", None).as_deref(), + Some(".*a\\.bcd.*") + ); + + // No literal run of three alphanumeric characters -> no index help. + assert_eq!(like_to_regex("%ab%", None), None); + assert_eq!(like_to_regex("%a%b%c%", None), None); + assert_eq!(like_to_regex("%", None), None); + + // The escape character makes the following character a literal. + assert_eq!( + like_to_regex(r"%foo\%bar%", Some('\\')).as_deref(), + Some(".*foo%bar.*") + ); + } + + #[test] + fn test_apply_regex_flags() { + fn flags(s: &str) -> Expr { + Expr::Literal(ScalarValue::Utf8(Some(s.to_string())), None) + } + + // Empty flags leave the pattern untouched (no inline group emitted). + assert_eq!(apply_regex_flags("foo", &flags("")).as_deref(), Some("foo")); + // Supported flags are folded into an inline `(?...)` prefix. + assert_eq!( + apply_regex_flags("foo", &flags("i")).as_deref(), + Some("(?i)foo") + ); + assert_eq!( + apply_regex_flags("foo", &flags("is")).as_deref(), + Some("(?is)foo") + ); + // An unrecognized flag bails out so the caller leaves the predicate to a + // full recheck rather than risk changing its semantics. + assert_eq!(apply_regex_flags("foo", &flags("g")), None); + // A non-string (hence non-literal-flags) argument cannot be folded. 
+ assert_eq!( + apply_regex_flags("foo", &Expr::Literal(ScalarValue::Int32(Some(1)), None)), + None + ); + } + #[test] fn test_extract_like_leading_prefix() { // Simple prefix patterns (no recheck needed) diff --git a/rust/lance-index/src/scalar/fmindex.rs b/rust/lance-index/src/scalar/fmindex.rs index 9677f7471ea..aed1136535a 100644 --- a/rust/lance-index/src/scalar/fmindex.rs +++ b/rust/lance-index/src/scalar/fmindex.rs @@ -1352,6 +1352,12 @@ impl ScalarIndex for FMIndexScalarIndex { Default::default(), ))) } + // Regex queries are routed only to the ngram index (the FM-index's + // query parser advertises `supports_regex = false`), so this is + // unreachable in practice; reject it explicitly rather than silently. + TextQuery::Regex(_) => Err(Error::invalid_input( + "FMIndex does not support regular expression queries", + )), } } fn can_remap(&self) -> bool { @@ -1645,6 +1651,9 @@ impl ScalarIndexPlugin for FMIndexPlugin { Some(Box::new(TextQueryParser::new( index_name, self.name().to_string(), + // needs_recheck: the FM-index returns exact substring matches. + false, + // supports_regex: regex acceleration is only implemented for ngram. 
false, ))) } diff --git a/rust/lance-index/src/scalar/ngram.rs b/rust/lance-index/src/scalar/ngram.rs index 72ef8d53a92..b452ef78c85 100644 --- a/rust/lance-index/src/scalar/ngram.rs +++ b/rust/lance-index/src/scalar/ngram.rs @@ -5,7 +5,10 @@ use std::any::Any; use std::collections::BTreeMap; use std::iter::once; use std::time::Instant; -use std::{collections::HashMap, sync::Arc}; +use std::{ + collections::{HashMap, HashSet}, + sync::Arc, +}; use super::lance_format::LanceIndexStore; use super::{ @@ -49,6 +52,9 @@ use roaring::{RoaringBitmap, RoaringTreemap}; use serde::Serialize; use tracing::instrument; +mod ngram_regex; +pub(crate) use ngram_regex::regex_can_use_index; + const TOKENS_COL: &str = "tokens"; const POSTING_LIST_COL: &str = "posting_list"; const POSTINGS_FILENAME: &str = "ngram_postings.lance"; @@ -476,6 +482,45 @@ impl ScalarIndex for NGramIndex { let row_ids = NGramPostingList::intersect(list_refs); Ok(SearchResult::at_most(RowAddrTreeMap::from(row_ids))) } + TextQuery::Regex(pattern) => { + let trigram_query = ngram_regex::regex_to_trigram_query(pattern); + match &trigram_query { + // No usable trigram structure (e.g. `a.b`, `.*`): the index + // cannot prune, so every row must be rechecked. + ngram_regex::TrigramQuery::All => { + Ok(SearchResult::at_least(RowAddrTreeMap::new())) + } + // The pattern is provably unsatisfiable. + ngram_regex::TrigramQuery::None => { + Ok(SearchResult::exact(RowAddrTreeMap::new())) + } + _ => { + let mut tokens = HashSet::new(); + ngram_regex::collect_tokens(&trigram_query, &mut tokens); + // Fetch the posting list for every trigram the condition + // references; a token absent from the index contributes + // an empty list, which `eval_trigram_query` handles. 
+ let present = tokens.into_iter().filter_map(|token| { + self.tokens.get(&token).map(|offset| (token, *offset)) + }); + let lists = futures::stream::iter(present.map(|(token, offset)| { + self.list_reader + .ngram_list(offset, metrics) + .map(move |result| result.map(|list| (token, list))) + })) + .buffer_unordered(self.io_parallelism) + .try_collect::)>>() + .await?; + metrics.record_comparisons(lists.len()); + let bitmaps: HashMap = lists + .into_iter() + .map(|(token, list)| (token, list.bitmap.clone())) + .collect(); + let row_ids = ngram_regex::eval_trigram_query(&trigram_query, &bitmaps); + Ok(SearchResult::at_most(RowAddrTreeMap::from(row_ids))) + } + } + } } } @@ -1279,6 +1324,9 @@ impl ScalarIndexPlugin for NGramIndexPlugin { Some(Box::new(TextQueryParser::new( index_name, self.name().to_string(), + // needs_recheck: ngram results are an inexact candidate superset. + true, + // supports_regex: the ngram index can answer regex queries. true, ))) } @@ -1538,6 +1586,107 @@ mod tests { assert_eq!(expected, res); } + #[test_log::test(tokio::test)] + async fn test_ngram_regex_search() { + // Same corpus as test_basic_ngram_index. 
+ let data = StringArray::from_iter_values([ + "cat", // 0 + "dog", // 1 + "cat dog", // 2 + "dog cat", // 3 + "elephant", // 4 + "mouse", // 5 + "rhino", // 6 + "giraffe", // 7 + "rhinos nose", // 8 + ]); + let row_ids = UInt64Array::from_iter_values((0..data.len()).map(|i| i as u64)); + let schema = Arc::new(Schema::new(vec![ + Field::new(VALUE_COLUMN_NAME, DataType::Utf8, false), + Field::new(ROW_ID, DataType::UInt64, false), + ])); + let data = + RecordBatch::try_new(schema.clone(), vec![Arc::new(data), Arc::new(row_ids)]).unwrap(); + let data = Box::pin(RecordBatchStreamAdapter::new( + schema, + stream::once(std::future::ready(Ok(data))), + )); + + let builder = NGramIndexBuilder::try_new(NGramIndexBuilderOptions::default()).unwrap(); + let (index, _tmpdir) = do_train(builder, data).await; + + async fn search(index: &NGramIndex, pattern: &str) -> SearchResult { + index + .search( + &TextQuery::Regex(pattern.to_string()), + &NoOpMetricsCollector, + ) + .await + .unwrap() + } + + // A plain literal yields the same candidates as contains("cat"). + assert_eq!( + search(&index, "cat").await, + SearchResult::at_most(RowAddrTreeMap::from_iter([0, 2, 3])) + ); + + // Alternation -> union of each branch's rows. + assert_eq!( + search(&index, "(cat|dog)").await, + SearchResult::at_most(RowAddrTreeMap::from_iter([0, 1, 2, 3])) + ); + + // AND across `.*`: must contain both the `rhino` and `nose` trigrams, so + // row 6 ("rhino") is correctly excluded and only row 8 survives. + assert_eq!( + search(&index, "rhino.*nose").await, + SearchResult::at_most(RowAddrTreeMap::from_iter([8])) + ); + + // No derivable trigram -> recheck everything. + assert_eq!( + search(&index, "a.b").await, + SearchResult::at_least(RowAddrTreeMap::new()) + ); + + // A trigram that is absent from the index -> empty candidate set. 
+ assert_eq!( + search(&index, "zzz").await, + SearchResult::at_most(RowAddrTreeMap::new()) + ); + } + + #[test_log::test(tokio::test)] + async fn test_ngram_regex_search_nulls() { + // Rows: cat(0), dog(1), NULL(2), NULL(3), cat dog(4). + let data = simple_data_with_nulls(); + let builder = NGramIndexBuilder::try_new(NGramIndexBuilderOptions::default()).unwrap(); + let (index, _tmpdir) = do_train(builder, data).await; + + // The NULL rows (2, 3) must never appear in the candidate set. + let res = index + .search(&TextQuery::Regex("cat".to_string()), &NoOpMetricsCollector) + .await + .unwrap(); + assert_eq!( + res, + SearchResult::at_most(RowAddrTreeMap::from_iter([0, 4])) + ); + + let res = index + .search( + &TextQuery::Regex("(cat|dog)".to_string()), + &NoOpMetricsCollector, + ) + .await + .unwrap(); + assert_eq!( + res, + SearchResult::at_most(RowAddrTreeMap::from_iter([0, 1, 4])) + ); + } + fn test_data_schema() -> Arc { Arc::new(Schema::new(vec![ Field::new(VALUE_COLUMN_NAME, DataType::Utf8, true), diff --git a/rust/lance-index/src/scalar/ngram/ngram_regex.rs b/rust/lance-index/src/scalar/ngram/ngram_regex.rs new file mode 100644 index 00000000000..ee67c479a71 --- /dev/null +++ b/rust/lance-index/src/scalar/ngram/ngram_regex.rs @@ -0,0 +1,673 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Deriving a trigram pre-filter from a regular expression. +//! +//! This is the query-side counterpart of the ngram index that lets us +//! accelerate `regexp_like` / `regexp_match` predicates the same way the index +//! already accelerates `contains`. The idea (the same one Postgres `pg_trgm` +//! and Russ Cox's Google Code Search use) is to derive, from the regex, a +//! boolean condition over trigram presence that is *necessary* for any string +//! to match, evaluate it against the inverted index, and let the scan recheck +//! the true regex on the surviving rows. +//! +//! 
The derived condition is a [`TrigramQuery`] -- an AND/OR tree of trigram +//! tokens. `AND` maps onto posting-list intersection and `OR` onto union, which +//! is exactly the set algebra the ngram index is built for. +//! +//! # Soundness +//! +//! The single invariant that matters is that the condition must never require a +//! trigram that a matching string could lack -- otherwise we would drop real +//! matches (a false negative, far worse than a false positive, which the recheck +//! removes). Everything here is therefore a conservative *over*-approximation: +//! when in doubt we emit [`TrigramQuery::All`] ("no constraint, recheck +//! everything"). Concretely: +//! +//! * Every trigram requirement is produced by [`trigrams_of_string`], which runs +//! the *same* tokenizer the index was built with, so a string shorter than a +//! trigram (or with no alphanumeric run) contributes no requirement. +//! * Character classes and case-insensitive folds are treated as a single +//! unknown character (`All`), because the index's normalization does not agree +//! with Unicode case folding (e.g. `(?i)c` also matches `ℂ`, which the index +//! does not fold to `c`). Literal runs -- the common case -- are fully used. +//! * When the exact / prefix / suffix string sets grow past a bound we first fold +//! their trigrams into the running condition and only then drop the strings, so +//! collapsing precision never removes a necessary trigram. + +use std::collections::{BTreeSet, HashMap, HashSet}; + +use regex_syntax::hir::{Class, Hir, HirKind}; +use roaring::RoaringTreemap; + +use super::{NGRAM_N, NGRAM_TOKENIZER, ngram_to_token, tokenize_visitor}; + +/// Maximum number of strings kept in an `exact` / `prefix` / `suffix` set before +/// it is folded into the trigram condition and dropped. +const MAX_SET_SIZE: usize = 16; +/// Maximum length (in characters) of a string kept in a set. Longer strings are +/// trimmed to a sound shorter affix. 
+const MAX_STRING_LEN: usize = 32; + +/// A boolean condition over trigram presence that is *necessary* for a regex to +/// match. `All` means "no constraint" and `None` means "unsatisfiable"; by +/// construction these only ever appear at the root of the tree. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] +pub enum TrigramQuery { + /// No constraint: every row is a candidate (the scan must recheck all rows). + All, + /// Unsatisfiable: no row can match. + None, + /// The given trigram token must be present. + Trigram(u32), + /// Every child condition must hold (posting-list intersection). + And(Vec), + /// At least one child condition must hold (posting-list union). + Or(Vec), +} + +impl TrigramQuery { + /// Build an `AND` of conditions, applying identity (`All`), absorbing + /// (`None`), flattening, sorting and de-duplication so the result is + /// canonical and free of nested `All`/`None`. + fn and(items: Vec) -> Self { + let mut flat = Vec::with_capacity(items.len()); + for item in items { + match item { + Self::All => {} // identity + Self::None => return Self::None, // absorbing + Self::And(children) => flat.extend(children), // flatten + other => flat.push(other), + } + } + flat.sort(); + flat.dedup(); + match flat.len() { + 0 => Self::All, + 1 => flat.pop().unwrap(), + _ => Self::And(flat), + } + } + + /// Build an `OR` of conditions, applying absorbing (`All`), identity + /// (`None`), flattening, sorting and de-duplication. 
+ fn or(items: Vec) -> Self { + let mut flat = Vec::with_capacity(items.len()); + for item in items { + match item { + Self::All => return Self::All, // absorbing + Self::None => {} // identity + Self::Or(children) => flat.extend(children), // flatten + other => flat.push(other), + } + } + flat.sort(); + flat.dedup(); + match flat.len() { + 0 => Self::None, + 1 => flat.pop().unwrap(), + _ => Self::Or(flat), + } + } +} + +/// Information about the set of strings a sub-expression can match, used to +/// build a necessary trigram condition bottom-up. For every string `s` the +/// sub-expression matches: `s` is in `exact` (when it is `Some`), `s` starts +/// with some member of `prefix` and ends with some member of `suffix`, and `s` +/// satisfies `match_q`. +struct RegexInfo { + /// Whether the sub-expression can match the empty string. + emptyable: bool, + /// The complete set of strings the sub-expression matches, or `None` if that + /// set is unbounded / unknown. + exact: Option>, + /// Strings that every match must start with (empty = unknown). + prefix: BTreeSet, + /// Strings that every match must end with (empty = unknown). + suffix: BTreeSet, + /// A necessary trigram condition for the sub-expression. + match_q: TrigramQuery, +} + +impl RegexInfo { + /// The empty string (also used for zero-width anchors): matches only `""`. + fn empty_string() -> Self { + let empty = BTreeSet::from([String::new()]); + Self { + emptyable: true, + exact: Some(empty.clone()), + prefix: empty.clone(), + suffix: empty, + match_q: TrigramQuery::All, + } + } + + /// A fixed literal string. + fn literal(s: &str) -> Self { + let set = BTreeSet::from([s.to_string()]); + Self { + emptyable: s.is_empty(), + exact: Some(set.clone()), + prefix: set.clone(), + suffix: set, + match_q: trigrams_of_string(s), + } + } + + /// A single unknown character (a character class we cannot pin down). 
+ fn any_char() -> Self { + Self { + emptyable: false, + exact: None, + prefix: BTreeSet::new(), + suffix: BTreeSet::new(), + match_q: TrigramQuery::All, + } + } + + /// Enforce the size/length bounds, folding any information about to be + /// discarded into `match_q` first so that precision loss never drops a + /// necessary trigram. Idempotent. + fn bound(&mut self) { + let oversized_exact = self.exact.as_ref().is_some_and(|exact| { + exact.len() > MAX_SET_SIZE || exact.iter().any(|s| s.chars().count() > MAX_STRING_LEN) + }); + if oversized_exact { + let exact = self.exact.take().expect("checked above"); + self.fold_into_match(&exact); + } + + self.prefix = self + .prefix + .iter() + .map(|s| leading(s, MAX_STRING_LEN)) + .collect(); + if self.prefix.len() > MAX_SET_SIZE { + let prefix = std::mem::take(&mut self.prefix); + self.fold_into_match(&prefix); + } + + self.suffix = self + .suffix + .iter() + .map(|s| trailing(s, MAX_STRING_LEN)) + .collect(); + if self.suffix.len() > MAX_SET_SIZE { + let suffix = std::mem::take(&mut self.suffix); + self.fold_into_match(&suffix); + } + } + + /// AND the trigrams of `set` (a complete set of possible affixes/strings) + /// into `match_q`. Sound because the set is exhaustive for its role. + fn fold_into_match(&mut self, set: &BTreeSet) { + let folded = trigrams_of_set(set.iter()); + let current = std::mem::replace(&mut self.match_q, TrigramQuery::All); + self.match_q = TrigramQuery::and(vec![current, folded]); + } +} + +/// AND together the trigrams of `s`. Reuses the index's own tokenizer so the +/// tokens are normalized (lowercase, ASCII-folded, alphanumeric-bounded) +/// exactly as they were stored. Returns `All` if `s` yields no trigram (too +/// short, or no run of three alphanumeric characters). 
+fn trigrams_of_string(s: &str) -> TrigramQuery { + let mut tokens = Vec::new(); + tokenize_visitor(&NGRAM_TOKENIZER, s, |ngram| { + tokens.push(TrigramQuery::Trigram(ngram_to_token(ngram, NGRAM_N))); + }); + TrigramQuery::and(tokens) +} + +/// OR together the trigram conditions of each string in `set`. An empty set +/// means "unknown" and yields `All` (no constraint); if any member yields `All` +/// the whole OR is `All`. +fn trigrams_of_set<'a>(set: impl IntoIterator) -> TrigramQuery { + let queries: Vec<_> = set.into_iter().map(|s| trigrams_of_string(s)).collect(); + if queries.is_empty() { + return TrigramQuery::All; + } + TrigramQuery::or(queries) +} + +/// Concatenate every string in `a` with every string in `b`. +fn cross_concat(a: &BTreeSet, b: &BTreeSet) -> BTreeSet { + let mut out = BTreeSet::new(); + for x in a { + for y in b { + out.insert(format!("{x}{y}")); + } + } + out +} + +/// The first `n` characters of `s` (a sound shorter prefix). +fn leading(s: &str, n: usize) -> String { + s.chars().take(n).collect() +} + +/// The last `n` characters of `s` (a sound shorter suffix). +fn trailing(s: &str, n: usize) -> String { + let count = s.chars().count(); + s.chars().skip(count.saturating_sub(n)).collect() +} + +/// If `class` matches exactly one scalar value, return that character. +fn singleton_char(class: &Class) -> Option { + match class { + Class::Unicode(u) => { + let ranges = u.ranges(); + match ranges { + [r] if r.start() == r.end() => Some(r.start()), + _ => None, + } + } + Class::Bytes(b) => { + let ranges = b.ranges(); + match ranges { + [r] if r.start() == r.end() && r.start() < 0x80 => Some(r.start() as char), + _ => None, + } + } + } +} + +/// Compute the [`RegexInfo`] for `hir` bottom-up. +fn analyze(hir: &Hir) -> RegexInfo { + let mut info = match hir.kind() { + // Zero-width: the empty match. Anchors (^, $, \b) carry no trigram. 
+ HirKind::Empty | HirKind::Look(_) => RegexInfo::empty_string(), + HirKind::Literal(lit) => match std::str::from_utf8(&lit.0) { + Ok(s) => RegexInfo::literal(s), + // A literal that is not valid UTF-8 cannot be reasoned about here. + Err(_) => RegexInfo::any_char(), + }, + HirKind::Class(class) => match singleton_char(class) { + Some(ch) => RegexInfo::literal(ch.encode_utf8(&mut [0u8; 4])), + None => RegexInfo::any_char(), + }, + HirKind::Repetition(rep) => { + let inner = analyze(&rep.sub); + let at_least_one = rep.min >= 1; + RegexInfo { + emptyable: !at_least_one || inner.emptyable, + // We do not unroll bounded repetitions, so the matched set is + // unbounded as far as we are concerned. + exact: None, + prefix: if at_least_one { + inner.prefix.clone() + } else { + BTreeSet::new() + }, + suffix: if at_least_one { + inner.suffix.clone() + } else { + BTreeSet::new() + }, + // Only a required occurrence (min >= 1) contributes; the single + // inner match is necessary, never multiplied. + match_q: if at_least_one { + inner.match_q + } else { + TrigramQuery::All + }, + } + } + HirKind::Capture(cap) => analyze(&cap.sub), + HirKind::Concat(subs) => analyze_concat(subs), + HirKind::Alternation(subs) => analyze_alternation(subs), + }; + info.bound(); + info +} + +fn analyze_concat(subs: &[Hir]) -> RegexInfo { + let mut acc = RegexInfo::empty_string(); + for sub in subs { + acc = concat_info(acc, analyze(sub)); + } + acc +} + +/// Combine two adjacent sub-expressions. This is the subtle part: it recovers +/// trigrams that straddle the junction via the cross product of `acc.suffix` and +/// `next.prefix`. +fn concat_info(acc: RegexInfo, next: RegexInfo) -> RegexInfo { + let emptyable = acc.emptyable && next.emptyable; + + // Trigrams spanning the junction (computed from the pre-merge affixes). 
+ let boundary = if acc.suffix.is_empty() || next.prefix.is_empty() { + TrigramQuery::All + } else { + trigrams_of_set(cross_concat(&acc.suffix, &next.prefix).iter()) + }; + + // exact = acc.exact x next.exact, only while both are finite and small. + let exact = match (&acc.exact, &next.exact) { + (Some(a), Some(b)) if a.len().saturating_mul(b.len()) <= MAX_SET_SIZE => { + Some(cross_concat(a, b)) + } + _ => None, + }; + + // A match starts with acc's full string (when known) then next's prefix, + // otherwise with acc's own prefix. + let prefix = match &acc.exact { + Some(a) if !next.prefix.is_empty() => cross_concat(a, &next.prefix), + Some(a) => a.clone(), + None => acc.prefix.clone(), + }; + + // Mirror image for the suffix (driven by the right side). + let suffix = match &next.exact { + Some(b) if !acc.suffix.is_empty() => cross_concat(&acc.suffix, b), + Some(b) => b.clone(), + None => next.suffix.clone(), + }; + + let match_q = TrigramQuery::and(vec![acc.match_q, next.match_q, boundary]); + + let mut info = RegexInfo { + emptyable, + exact, + prefix, + suffix, + match_q, + }; + info.bound(); + info +} + +fn analyze_alternation(subs: &[Hir]) -> RegexInfo { + let infos: Vec = subs.iter().map(analyze).collect(); + + let emptyable = infos.iter().any(|i| i.emptyable); + + let exact = if infos.iter().all(|i| i.exact.is_some()) { + Some( + infos + .iter() + .flat_map(|i| i.exact.as_ref().unwrap().iter().cloned()) + .collect(), + ) + } else { + None + }; + + // A common prefix exists only if every branch contributes one. 
+ let prefix = if infos.iter().all(|i| !i.prefix.is_empty()) { + infos + .iter() + .flat_map(|i| i.prefix.iter().cloned()) + .collect() + } else { + BTreeSet::new() + }; + let suffix = if infos.iter().all(|i| !i.suffix.is_empty()) { + infos + .iter() + .flat_map(|i| i.suffix.iter().cloned()) + .collect() + } else { + BTreeSet::new() + }; + + let match_q = TrigramQuery::or(infos.into_iter().map(|i| i.match_q).collect()); + + RegexInfo { + emptyable, + exact, + prefix, + suffix, + match_q, + } +} + +/// Derive a necessary trigram condition from a regular expression pattern. +/// +/// Returns [`TrigramQuery::All`] when no useful condition can be derived (an +/// unparsable pattern, or one with no trigram-able literal structure such as +/// `a.b` or `.*`); callers must treat that as "recheck everything". +pub fn regex_to_trigram_query(pattern: &str) -> TrigramQuery { + // An unparsable pattern cannot be accelerated; rechecking is still safe. + let Ok(hir) = regex_syntax::parse(pattern) else { + return TrigramQuery::All; + }; + let info = analyze(&hir); + + let mut conditions = vec![info.match_q]; + if let Some(exact) = &info.exact { + if exact.is_empty() { + // The expression matches nothing. + return TrigramQuery::None; + } + conditions.push(trigrams_of_set(exact.iter())); + } + conditions.push(trigrams_of_set(info.prefix.iter())); + conditions.push(trigrams_of_set(info.suffix.iter())); + TrigramQuery::and(conditions) +} + +/// Whether a regular expression yields any trigram condition the index can use +/// to prune candidates. When it does not (e.g. `a.b`, `.*`, or a case-insensitive +/// pattern), callers should leave the predicate to a full scan rather than route +/// it to the index, which would otherwise have to ask the scan to recheck every +/// row -- a path the index result type (`AtLeast`) does not support. 
+pub fn regex_can_use_index(pattern: &str) -> bool { + regex_to_trigram_query(pattern) != TrigramQuery::All +} + +/// Collect the distinct trigram tokens referenced anywhere in the tree. +pub fn collect_tokens(query: &TrigramQuery, out: &mut HashSet) { + match query { + TrigramQuery::Trigram(token) => { + out.insert(*token); + } + TrigramQuery::And(items) | TrigramQuery::Or(items) => { + for item in items { + collect_tokens(item, out); + } + } + TrigramQuery::All | TrigramQuery::None => {} + } +} + +/// Evaluate the tree against a map of `trigram token -> posting list`. A token +/// missing from the map contributes an empty set (sound: a required trigram that +/// is absent everywhere yields no rows; an absent OR branch contributes +/// nothing). `All` / `None` are handled by the caller before evaluation. +pub fn eval_trigram_query( + query: &TrigramQuery, + bitmaps: &HashMap, +) -> RoaringTreemap { + match query { + TrigramQuery::Trigram(token) => bitmaps.get(token).cloned().unwrap_or_default(), + TrigramQuery::And(items) => { + let mut iter = items.iter(); + let mut acc = match iter.next() { + Some(first) => eval_trigram_query(first, bitmaps), + None => return RoaringTreemap::new(), + }; + for item in iter { + if acc.is_empty() { + break; + } + acc &= &eval_trigram_query(item, bitmaps); + } + acc + } + TrigramQuery::Or(items) => { + let mut acc = RoaringTreemap::new(); + for item in items { + acc |= &eval_trigram_query(item, bitmaps); + } + acc + } + TrigramQuery::All | TrigramQuery::None => RoaringTreemap::new(), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// A single trigram condition, hashed the same way the index hashes it. 
+ fn tri(trigram: &str) -> TrigramQuery { + TrigramQuery::Trigram(ngram_to_token(trigram, NGRAM_N)) + } + + fn q(pattern: &str) -> TrigramQuery { + regex_to_trigram_query(pattern) + } + + #[test] + fn test_single_literal_trigram() { + assert_eq!(q("foo"), tri("foo")); + } + + #[test] + fn test_multi_trigram_literal() { + assert_eq!( + q("foobar"), + TrigramQuery::and(vec![tri("foo"), tri("oob"), tri("oba"), tri("bar")]) + ); + } + + #[test] + fn test_wildcard_splits_into_and() { + // `.*` breaks the literal run; both sides are required. + assert_eq!( + q("foo.*bar"), + TrigramQuery::and(vec![tri("foo"), tri("bar")]) + ); + } + + #[test] + fn test_alternation_is_or() { + assert_eq!( + q("(cat|dog)"), + TrigramQuery::or(vec![tri("cat"), tri("dog")]) + ); + } + + #[test] + fn test_anchors_are_transparent() { + assert_eq!( + q("^rhino"), + TrigramQuery::and(vec![tri("rhi"), tri("hin"), tri("ino")]) + ); + assert_eq!(q("nose$"), TrigramQuery::and(vec![tri("nos"), tri("ose")])); + } + + #[test] + fn test_boundary_trigram_recovered_across_groups() { + // A capturing group is not merged into the adjacent literals, so this + // exercises the suffix x prefix cross product that recovers the `foo` + // trigram straddling the `(o)` group boundary in "foobar". + assert_eq!( + q("fo(o)bar"), // spellchecker:disable-line + TrigramQuery::and(vec![tri("foo"), tri("oob"), tri("oba"), tri("bar")]) + ); + } + + #[test] + fn test_no_trigram_yields_all() { + // No run of three literal characters anywhere. + assert_eq!(q("a.b"), TrigramQuery::All); + assert_eq!(q(".*"), TrigramQuery::All); + // Every alternation branch is shorter than a trigram, so we must not + // require either two-character branch as a (non-existent) trigram. + assert_eq!(q("fo|ba"), TrigramQuery::All); // spellchecker:disable-line + } + + #[test] + fn test_case_insensitive_not_accelerated() { + // Unicode case folding (e.g. 
`(?i)c` also matches U+2102) does not agree + // with the index's normalization, so case-insensitive patterns are left + // unaccelerated (correct via recheck) rather than risk a false negative. + assert_eq!(q("(?i)Cat"), TrigramQuery::All); + } + + #[test] + fn test_unparsable_pattern_yields_all() { + assert_eq!(q("("), TrigramQuery::All); + } + + #[test] + fn test_large_alternation_stays_bounded() { + // More than MAX_SET_SIZE branches: must still produce a sound OR without + // panicking or exploding. + let pattern = (0..40) + .map(|i| format!("aa{i:02}zz")) + .collect::>() + .join("|"); + let result = q(&pattern); + // Each branch shares the trigram `aa0`/`aa1`/... and `zz`-ish endings; + // the important property is that it is a sound non-empty condition. + assert_ne!(result, TrigramQuery::None); + } + + #[test] + fn test_plus_requires_inner() { + // `(abc)+` must contain at least one `abc`. + assert_eq!(q("(abc)+"), tri("abc")); + } + + #[test] + fn test_optional_group_is_not_required() { + // `(foo)?bar` -> foo optional, bar required. + assert_eq!(q("(foo)?bar"), tri("bar")); + } + + #[test] + fn test_eval_and_or_with_missing_tokens() { + let foo = ngram_to_token("foo", NGRAM_N); + let bar = ngram_to_token("bar", NGRAM_N); + let mut bitmaps = HashMap::new(); + bitmaps.insert(foo, RoaringTreemap::from_iter([1u64, 2, 3])); + bitmaps.insert(bar, RoaringTreemap::from_iter([2u64, 3, 4])); + // `baz` is absent from the index. + + // AND intersects. + let and = TrigramQuery::and(vec![tri("foo"), tri("bar")]); + assert_eq!( + eval_trigram_query(&and, &bitmaps), + RoaringTreemap::from_iter([2u64, 3]) + ); + + // OR unions. + let or = TrigramQuery::or(vec![tri("foo"), tri("bar")]); + assert_eq!( + eval_trigram_query(&or, &bitmaps), + RoaringTreemap::from_iter([1u64, 2, 3, 4]) + ); + + // A missing token is empty: it zeroes an AND but is harmless in an OR. 
+ let and_missing = TrigramQuery::and(vec![tri("foo"), tri("baz")]); + assert!(eval_trigram_query(&and_missing, &bitmaps).is_empty()); + let or_missing = TrigramQuery::or(vec![tri("foo"), tri("baz")]); + assert_eq!( + eval_trigram_query(&or_missing, &bitmaps), + RoaringTreemap::from_iter([1u64, 2, 3]) + ); + } + + #[test] + fn test_collect_tokens() { + let query = TrigramQuery::and(vec![ + tri("foo"), + TrigramQuery::or(vec![tri("bar"), tri("baz")]), + ]); + let mut tokens = HashSet::new(); + collect_tokens(&query, &mut tokens); + assert_eq!( + tokens, + HashSet::from([ + ngram_to_token("foo", NGRAM_N), + ngram_to_token("bar", NGRAM_N), + ngram_to_token("baz", NGRAM_N), + ]) + ); + } +} diff --git a/rust/lance/Cargo.toml b/rust/lance/Cargo.toml index 74e6faf5c07..440c3fb301a 100644 --- a/rust/lance/Cargo.toml +++ b/rust/lance/Cargo.toml @@ -175,6 +175,10 @@ required-features = ["cli"] name = "scalar_index" harness = false +[[bench]] +name = "regex_ngram" +harness = false + [[bench]] name = "merge_insert" harness = false diff --git a/rust/lance/benches/regex_ngram.rs b/rust/lance/benches/regex_ngram.rs new file mode 100644 index 00000000000..76f597ad9cb --- /dev/null +++ b/rust/lance/benches/regex_ngram.rs @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Benchmark: regex predicate scans over an ngram-indexed string column. +//! +//! Each query is a `regexp_match(doc, '...')` filter against a dataset that has +//! an NGram index on `doc`. The query set spans a selective AND pattern, an +//! alternation, a plain literal (rewritten to an infix LIKE before it reaches +//! the index), and a deliberately non-accelerable pattern (`a.b`, which yields +//! no trigram) that serves as a regression guard. +//! +//! On `main` none of these use the index (regex falls through to a full scan + +//! recheck); with the ngram-regex acceleration the index prunes candidates for +//! 
the first three while `a.b` stays a full scan. Capture a baseline on `main` +//! with `--save-baseline before_7130`, then compare after the change with +//! `--baseline before_7130`. + +use std::hint::black_box; +use std::sync::Arc; +use std::time::Duration; + +use arrow::array::AsArray; +use arrow_array::{RecordBatch, RecordBatchIterator, StringArray}; +use arrow_schema::{DataType, Field, Schema}; +use criterion::{Criterion, criterion_group, criterion_main}; +use futures::TryStreamExt; +use lance::Dataset; +use lance::index::DatasetIndexExt; +use lance_core::utils::tempfile::TempStrDir; +use lance_datagen::{RowCount, array}; +use lance_index::IndexType; +use lance_index::scalar::ScalarIndexParams; +#[cfg(target_os = "linux")] +use lance_testing::pprof::{Output, PProfProfiler}; + +const TOTAL: usize = 200_000; + +/// Build the `doc` column: random sentences with rare markers injected into a +/// small fraction of rows so the regex queries have controlled selectivity. +/// The markers (`zqxwvu`, `needlexyz`, `qwerasdf`) are unlikely to appear in +/// the generated English-word sentences. +fn build_docs() -> StringArray { + let mut sentence_gen = array::random_sentence(1, 30, false); + let base = sentence_gen + .generate_default(RowCount::from(TOTAL as u64)) + .unwrap(); + let base = base.as_string::(); + let docs = (0..TOTAL).map(|i| { + let sentence = base.value(i); + if i % 200 == 0 { + // ~0.5% of rows match `zqxwvu.*needlexyz` and `zqxwvu`. + format!("{sentence} zqxwvu needlexyz") + } else if i % 211 == 0 { + // A second marker for the alternation query. 
+ format!("{sentence} qwerasdf") + } else { + sentence.to_string() + } + }); + StringArray::from_iter_values(docs) +} + +async fn build_dataset(tempdir: &TempStrDir) -> Arc { + let schema = Arc::new(Schema::new(vec![Field::new("doc", DataType::Utf8, false)])); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(build_docs())]).unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + + let mut dataset = Dataset::write(reader, tempdir.as_str(), None) + .await + .unwrap(); + dataset + .create_index( + &["doc"], + IndexType::NGram, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + Arc::new(dataset) +} + +async fn scan_filter(dataset: &Dataset, filter: &str) -> usize { + let mut scanner = dataset.scan(); + scanner.filter(filter).unwrap(); + let stream = scanner.try_into_stream().await.unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + batches.iter().map(|b| b.num_rows()).sum() +} + +fn bench_regex_ngram(c: &mut Criterion) { + let rt = tokio::runtime::Runtime::new().unwrap(); + let tempdir = TempStrDir::default(); + let dataset = rt.block_on(build_dataset(&tempdir)); + + let queries = [ + ("selective_and", "regexp_match(doc, 'zqxwvu.*needlexyz')"), + ( + "alternation", + "regexp_match(doc, '(zqxwvu|qwerasdf|needlexyz)')", + ), + ("plain_literal", "regexp_match(doc, 'zqxwvu')"), + ("non_accelerable_a_dot_b", "regexp_match(doc, 'a.b')"), + ]; + + let mut group = c.benchmark_group("regex_ngram"); + group + .sample_size(10) + .measurement_time(Duration::from_secs(15)); + for (name, filter) in queries { + group.bench_function(name, |b| { + b.iter(|| black_box(rt.block_on(scan_filter(&dataset, filter)))); + }); + } + group.finish(); +} + +#[cfg(target_os = "linux")] +criterion_group!( + name = benches; + config = Criterion::default() + .significance_level(0.1) + .sample_size(10) + .with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = bench_regex_ngram); + 
+#[cfg(not(target_os = "linux"))] +criterion_group!( + name = benches; + config = Criterion::default().significance_level(0.1).sample_size(10); + targets = bench_regex_ngram); + +criterion_main!(benches); diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index 1112721bb33..2d75104d26e 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -8412,6 +8412,198 @@ mod test { .unwrap(); } + #[tokio::test] + async fn test_ngram_regex_index_scan() { + use arrow::array::AsArray; + + // A small, fixed corpus written across multiple fragments so the ngram + // index spans fragment boundaries. + let values = [ + "rhino", // 0 + "rhinos nose", // 1 + "cat", // 2 + "dog", // 3 + "cat dog", // 4 + "elephant", // 5 + "catalog", // 6 + "scatter", // 7 + "rhino horn", // 8 + "mouse", // 9 + "category", // 10 + "dogma", // 11 + ]; + let array = StringArray::from_iter_values(values); + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "s", + DataType::Utf8, + false, + )])); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + let write_params = WriteParams { + max_rows_per_file: 4, // 12 rows -> 3 fragments + ..Default::default() + }; + let mut dataset = Dataset::write(reader, "memory://test_ngram_regex", Some(write_params)) + .await + .unwrap(); + dataset + .create_index( + &["s"], + IndexType::NGram, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + assert!( + dataset.get_fragments().len() > 1, + "expected a multi-fragment dataset" + ); + + // Scan with `filter` and return the matched `s` values, sorted. 
+ async fn matched(dataset: &Dataset, filter: &str) -> Vec { + let mut scan = dataset.scan(); + scan.filter(filter).unwrap(); + let batches = scan + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + let mut out = Vec::new(); + for batch in batches { + let col = batch.column_by_name("s").unwrap().as_string::(); + out.extend(col.iter().flatten().map(|s| s.to_string())); + } + out.sort(); + out + } + + // `regexp_like`: a plain literal substring. + assert_eq!( + matched(&dataset, "regexp_like(s, 'rhino')").await, + ["rhino", "rhino horn", "rhinos nose"] + ); + // `regexp_match` (coerced to `IsNotNull(regexp_match(...))`) accelerates too. + assert_eq!( + matched(&dataset, "regexp_match(s, 'rhino')").await, + ["rhino", "rhino horn", "rhinos nose"] + ); + // Anchored: recheck must drop trigram false positives -- the `cat` + // trigram also occurs in cat dog / catalog / scatter / category. + assert_eq!(matched(&dataset, "regexp_like(s, 'cat$')").await, ["cat"]); + // AND across `.*`: row 8 ("rhino horn") shares the rhino trigrams but + // lacks the nose trigrams, so only "rhinos nose" survives. + assert_eq!( + matched(&dataset, "regexp_like(s, 'rhino.*nose')").await, + ["rhinos nose"] + ); + // Alternation. + assert_eq!( + matched(&dataset, "regexp_like(s, '(catalog|elephant)')").await, + ["catalog", "elephant"] + ); + // A non-accelerable pattern (no trigram derivable) still returns correct + // results via a full recheck. + assert_eq!(matched(&dataset, "regexp_like(s, 'o.m')").await, ["dogma"]); + // A case-insensitive flag is not accelerated (the index normalization + // disagrees with Unicode case folding) but must still return correct + // results via a full recheck -- here matching despite the upper-case + // pattern. This exercises the three-argument `regexp_like` flags path. 
+ assert_eq!( + matched(&dataset, "regexp_like(s, 'RHINO', 'i')").await, + ["rhino", "rhino horn", "rhinos nose"] + ); + + // Infix LIKE is accelerated through the same machinery (a plain-literal + // `regexp_like` is rewritten to LIKE before it reaches the index). + assert_eq!( + matched(&dataset, "s LIKE '%rhino%'").await, + ["rhino", "rhino horn", "rhinos nose"] + ); + // Prefix LIKE: recheck drops "scatter", which contains the `cat` trigram + // but does not start with "cat". + assert_eq!( + matched(&dataset, "s LIKE 'cat%'").await, + ["cat", "cat dog", "catalog", "category"] + ); + + // The ngram index is actually engaged for every accelerated form. + for filter in [ + "regexp_like(s, 'rhino')", + "regexp_match(s, 'rhino')", + "s LIKE '%rhino%'", + ] { + let mut scan = dataset.scan(); + scan.filter(filter).unwrap(); + let plan = scan.create_plan().await.unwrap(); + let plan_str = format!( + "{}", + datafusion::physical_plan::displayable(plan.as_ref()).indent(true) + ); + assert!( + plan_str.contains("ScalarIndexQuery") && plan_str.contains("NGram"), + "expected ngram index usage for `{filter}`, got plan:\n{plan_str}" + ); + } + } + + #[tokio::test] + async fn test_ngram_regex_non_accelerable_recheck() { + // `a.b` yields no trigram, so the index returns "recheck everything". + // This must still produce ALL correct matches across fragments, not an + // empty set (a regression test for the AtLeast recheck path, which a + // single-match case would not catch). 
+ let unit = ["acb", "dog", "axb", "cat", "qqq", "rhino"]; + let values: Vec<&str> = unit.iter().copied().cycle().take(60).collect(); + let array = StringArray::from_iter_values(values); + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "text", + DataType::Utf8, + false, + )])); + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + let write_params = WriteParams { + max_rows_per_file: 20, // 60 rows -> 3 fragments + ..Default::default() + }; + let mut dataset = + Dataset::write(reader, "memory://test_ngram_regex_ne", Some(write_params)) + .await + .unwrap(); + dataset + .create_index( + &["text"], + IndexType::NGram, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + + async fn count(dataset: &Dataset, filter: &str) -> usize { + let mut scan = dataset.scan(); + scan.filter(filter).unwrap(); + let batches = scan + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + batches.iter().map(|b| b.num_rows()).sum() + } + + // "acb" and "axb" each appear 10 times in the 60 rows -> 20 matches. + assert_eq!(count(&dataset, "regexp_match(text, 'a.b')").await, 20); + assert_eq!(count(&dataset, "regexp_like(text, 'a.b')").await, 20); + } + #[tokio::test] async fn test_like_prefix_with_btree_index() { // Create dataset with string data that has various prefixes From 27570a380f5f1155d48bbc48d66f2b17b5e37059 Mon Sep 17 00:00:00 2001 From: Dan Rammer Date: Tue, 16 Jun 2026 13:30:03 -0500 Subject: [PATCH 112/177] perf(mem_wal): parallelize fresh-tier source planning and execution (#7257) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary The LSM FTS and vector search planners (`LsmFtsSearchPlanner`, `LsmVectorSearchPlanner`) built each source's plan in a sequential `for` loop and unioned the arms under a single `SortPreservingMergeExec`. 
The merge polls every union arm from one task, so per-arm CPU (posting/index decode, BM25 and distance scoring) serialized even though the underlying IO awaits interleave — wall time grew linearly with the flushed-generation count. This: - builds the per-source plans concurrently with `try_join_all` (FTS + vector), - runs the cross-source block-list PK hashing concurrently (`block_list.rs`), - wraps the union in a round-robin `RepartitionExec` via a new `spawn_union_arms` helper so each arm gets its own driver task. Rows stay disjoint across partitions, so the per-partition TopK + sort-preserving merge semantics are unchanged. ## Changes - `scanner/exec.rs`: `spawn_union_arms` helper (round-robin repartition over the union). - `scanner/fts_search.rs`, `scanner/vector_search.rs`: concurrent per-source plan builds + `spawn_union_arms` over the union. - `scanner/block_list.rs`: concurrent flushed-generation PK-hash loads. ## Validation Validated end-to-end against a WAL FTS benchmark on minikube with object storage behind a 10ms/GET latency proxy. Read latency over a fresh tier as a function of flushed-generation count, p50: | generations | before | after | |---|---|---| | 2 | 1,164ms | 565ms | | 5 | 1,983ms | 610ms | | 10 | 3,585ms | 660ms | | 18 | 6,071ms | 759ms | Per-generation slope dropped from ~290ms/gen to ~12ms/gen. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Fable 5 --- .../src/dataset/mem_wal/scanner/block_list.rs | 52 ++++++++------ .../src/dataset/mem_wal/scanner/fts_search.rs | 55 ++++++++++----- .../dataset/mem_wal/scanner/vector_search.rs | 67 ++++++++++++------- 3 files changed, 113 insertions(+), 61 deletions(-) diff --git a/rust/lance/src/dataset/mem_wal/scanner/block_list.rs b/rust/lance/src/dataset/mem_wal/scanner/block_list.rs index 684fde48da1..8a293c3f988 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/block_list.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/block_list.rs @@ -47,9 +47,12 @@ pub async fn compute_source_block_lists( flushed_cache: Option<&Arc>, ) -> Result { // Hash each non-base source's membership, grouped by shard (generations are - // per-shard, so supersession is within-shard only). + // per-shard, so supersession is within-shard only). Flushed-generation PK + // scans (cold-cache S3 reads) run concurrently; order is irrelevant — the + // per-shard lists are sorted by generation below. let mut by_shard: ShardGenSets = HashMap::new(); let mut has_base = false; + let mut flushed_loads = Vec::new(); for source in sources { match source { LsmDataSource::BaseTable { .. } => has_base = true, @@ -72,14 +75,20 @@ pub async fn compute_source_block_lists( .. } => { // Cached by immutable path so repeated searches skip the PK scan. - let hashes = flushed_pk_hashes(path, pk_columns, session, flushed_cache).await?; - by_shard - .entry(*shard_id) - .or_default() - .push((*generation, hashes)); + flushed_loads.push(async move { + flushed_pk_hashes(path, pk_columns, session, flushed_cache) + .await + .map(|hashes| (*shard_id, *generation, hashes)) + }); } } } + for (shard_id, generation, hashes) in futures::future::try_join_all(flushed_loads).await? 
{ + by_shard + .entry(shard_id) + .or_default() + .push((generation, hashes)); + } let mut blocked: SourceBlockLists = HashMap::new(); // Base (shardless, oldest) is superseded by every non-base generation. @@ -115,22 +124,25 @@ pub async fn fresh_tier_block_list( session: Option<&Arc>, flushed_cache: Option<&Arc>, ) -> Result>>> { - let mut sets = Vec::new(); - for source in sources { - let set = match source { - LsmDataSource::BaseTable { .. } => continue, - LsmDataSource::ActiveMemTable { batch_store, .. } => { - Arc::new(pk_hashes_from_batch_store(batch_store, pk_columns)?) - } + // Flushed PK scans run concurrently (cold-cache S3 reads); ordered + // try_join_all keeps the source order of the returned sets. + let sets = futures::future::try_join_all(sources.iter().map(|source| async move { + Ok::<_, lance_core::Error>(match source { + LsmDataSource::BaseTable { .. } => None, + LsmDataSource::ActiveMemTable { batch_store, .. } => Some(Arc::new( + pk_hashes_from_batch_store(batch_store, pk_columns)?, + )), LsmDataSource::FlushedMemTable { path, .. } => { - flushed_pk_hashes(path, pk_columns, session, flushed_cache).await? + Some(flushed_pk_hashes(path, pk_columns, session, flushed_cache).await?) 
} - }; - if !set.is_empty() { - sets.push(set); - } - } - Ok(sets) + }) + })) + .await?; + Ok(sets + .into_iter() + .flatten() + .filter(|set| !set.is_empty()) + .collect()) } /// Hash the PK membership of an in-memory memtable (active or frozen) from its diff --git a/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs b/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs index e3ef44d8b1a..626b0effe3c 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs @@ -167,23 +167,39 @@ impl LsmFtsSearchPlanner { .await?; let overfetch = self.overfetch_factor.max(1.0); - let mut per_source_plans: Vec> = Vec::with_capacity(sources.len()); - for source in &sources { - let is_active = matches!(source, LsmDataSource::ActiveMemTable { .. }); - let blocked = block_lists.get(&(source.shard_id(), source.generation())); - // Over-fetch a blocked source so the post-filter still yields k live - // rows. The active arm returns all matches (no builder limit), so its - // within-source dedup needs no over-fetch hint. - let fetch_k = if blocked.is_some() { - ((k as f64) * overfetch).ceil() as usize - } else { - k - }; - - let plan = self - .build_source_plan(source, column, &query, fetch_k, projection, is_active) - .await?; + // Stage the per-source over-fetch decisions, then build every source + // plan concurrently — the builds are independent and a sequential loop + // was the dominant serial planning cost at multiple generations. + let arm_inputs: Vec<_> = sources + .iter() + .map(|source| { + let is_active = matches!(source, LsmDataSource::ActiveMemTable { .. }); + let blocked = block_lists.get(&(source.shard_id(), source.generation())); + // Over-fetch a blocked source so the post-filter still yields k live + // rows. The active arm returns all matches (no builder limit), so its + // within-source dedup needs no over-fetch hint. 
+ let fetch_k = if blocked.is_some() { + ((k as f64) * overfetch).ceil() as usize + } else { + k + }; + (source, is_active, blocked, fetch_k) + }) + .collect(); + let built = + futures::future::try_join_all(arm_inputs.iter().map( + |(source, is_active, _, fetch_k)| { + Box::pin(self.build_source_plan( + source, column, &query, *fetch_k, projection, *is_active, + )) + }, + )) + .await?; + let mut per_source_plans: Vec> = Vec::with_capacity(sources.len()); + for ((_, is_active, blocked, _), plan) in arm_inputs.iter().zip(built) { + let is_active = *is_active; + let blocked = *blocked; // Dedup, mirroring LsmVectorSearchPlanner: // * active: collapse duplicate-PK appends to the newest insert // (larger _rowid = inserted later). The FTS index is append-only, @@ -219,8 +235,11 @@ impl LsmFtsSearchPlanner { per_source_plans.into_iter().next().unwrap() } else { #[allow(deprecated)] - let union: Arc = Arc::new(UnionExec::new(per_source_plans)); - union + // The downstream `SortPreservingMergeExec` already spawns one driver + // task per input partition (one per union arm) via `spawn_buffered`, + // so each arm's per-arm CPU (posting decode, BM25) runs on its own + // task without an extra repartition. + Arc::new(UnionExec::new(per_source_plans)) }; let score_idx = merged.schema().index_of(SCORE_COLUMN).map_err(|_| { diff --git a/rust/lance/src/dataset/mem_wal/scanner/vector_search.rs b/rust/lance/src/dataset/mem_wal/scanner/vector_search.rs index b6b1f952b25..878063321aa 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/vector_search.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/vector_search.rs @@ -233,30 +233,47 @@ impl LsmVectorSearchPlanner { // `block_lists` is non-empty exactly when a newer generation exists. 
let refine_base = refine_base_table || !block_lists.is_empty(); + // Stage per-source over-fetch decisions, then build every KNN plan + // concurrently — the builds are independent and a sequential loop was + // the dominant serial planning cost at multiple generations. + let arm_inputs: Vec<_> = sources + .iter() + .map(|source| { + let generation = source.generation(); + let is_base = matches!(source, LsmDataSource::BaseTable { .. }); + let is_active = matches!(source, LsmDataSource::ActiveMemTable { .. }); + // Over-fetch when the post-source filter can drop candidates: a + // blocked source loses superseded rows; the active source's + // within-source dedup collapses duplicate-PK HNSW nodes. Block + // lookup is per shard — generations are per-shard. + let blocked = block_lists.get(&(source.shard_id(), generation)); + let fetch_k = if blocked.is_some() || is_active { + ((k as f64) * overfetch_factor).ceil() as usize + } else { + k + }; + (source, is_base, is_active, blocked, fetch_k) + }) + .collect(); + let built = futures::future::try_join_all(arm_inputs.iter().map( + |(source, is_base, _, _, fetch_k)| { + Box::pin(self.build_knn_plan( + source, + query_vector, + *fetch_k, + nprobes, + projection, + *is_base && refine_base, + )) + }, + )) + .await?; + let mut knn_plans = Vec::new(); - for source in &sources { - let generation = source.generation(); - let is_base = matches!(source, LsmDataSource::BaseTable { .. }); - let is_active = matches!(source, LsmDataSource::ActiveMemTable { .. }); - // Over-fetch when the post-source filter can drop candidates: a - // blocked source loses superseded rows; the active source's - // within-source dedup collapses duplicate-PK HNSW nodes. Block - // lookup is per shard — generations are per-shard. 
- let blocked = block_lists.get(&(source.shard_id(), generation)); - let fetch_k = if blocked.is_some() || is_active { - ((k as f64) * overfetch_factor).ceil() as usize - } else { - k - }; - let knn = Box::pin(self.build_knn_plan( - source, - query_vector, - fetch_k, - nprobes, - projection, - is_base && refine_base, - )) - .await?; + for ((_, is_base, is_active, blocked, _), knn) in arm_inputs.iter().zip(built) { + let is_base = *is_base; + let is_active = *is_active; + let blocked = *blocked; // Make each source independently newest-per-PK before the union: // * active: the append-only HNSW returns one node per inserted // version, so collapse duplicate PKs to the newest insert @@ -301,6 +318,10 @@ impl LsmVectorSearchPlanner { // No cross-source dedup needed (see struct doc): SortExec(per partition) // + SortPreservingMerge does the p-way distance-ordered top-k merge. #[allow(deprecated)] + // The downstream `SortPreservingMergeExec` already spawns one driver + // task per input partition (one per union arm) via `spawn_buffered`, so + // each arm's per-arm CPU (HNSW search, distance refine) runs on its own + // task without an extra repartition. let merged: Arc = Arc::new(UnionExec::new(knn_plans)); let distance_idx = merged.schema().index_of(DISTANCE_COLUMN).map_err(|_| { From bf9f706fb4ebb5605a65c1f499d1cb9a10de996e Mon Sep 17 00:00:00 2001 From: George Stamatakis <126914070+gstamatakis95@users.noreply.github.com> Date: Tue, 16 Jun 2026 20:48:41 +0200 Subject: [PATCH 113/177] fix(python): python binding of `Compaction.commit` (#7210) Closes #7209 - `python/src/dataset/optimize.rs`: `Compaction.commit` gains an optional `options` dict parameter (`#[pyo3(signature = (dataset, rewrites, options = None))]`), parsed through the same `parse_compaction_options` helper that `plan` and `execute` already use, seeded from the dataset's `manifest.config` exactly like `execute`. Absent or `None` preserves the old behavior. The TODO comment is removed. 
- `python/python/lance/lance/optimize.pyi`: stub updated to `commit(dataset, rewrites, options: Optional[CompactionOptions] = None)`. - `python/python/tests/test_optimize.py`: new parametrized test `test_defer_index_remap_via_commit_options`. With `Compaction.commit(dataset, rewrites, options={"defer_index_remap": True})` a `__lance_frag_reuse` system index appears in `describe_indices()`. Without options it does not (inline remap occurs). Mirrors the adjacent `test_defer_index_remap` setup. --- python/python/lance/lance/optimize.pyi | 6 ++-- python/python/tests/test_optimize.py | 41 ++++++++++++++++++++++++++ python/src/dataset/optimize.rs | 14 +++++++-- 3 files changed, 56 insertions(+), 5 deletions(-) diff --git a/python/python/lance/lance/optimize.pyi b/python/python/lance/lance/optimize.pyi index 9a26d23c003..c4b6b6546e6 100644 --- a/python/python/lance/lance/optimize.pyi +++ b/python/python/lance/lance/optimize.pyi @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List +from typing import List, Optional from lance import LanceDataset from lance.fragment import FragmentMetadata @@ -51,5 +51,7 @@ class Compaction: def plan(dataset: "LanceDataset", options: CompactionOptions) -> CompactionPlan: ... @staticmethod def commit( - dataset: "LanceDataset", rewrites: List[RewriteResult] + dataset: "LanceDataset", + rewrites: List[RewriteResult], + options: Optional[CompactionOptions] = None, ) -> CompactionMetrics: ... 
diff --git a/python/python/tests/test_optimize.py b/python/python/tests/test_optimize.py index ccd889db116..049ce2cc3a5 100644 --- a/python/python/tests/test_optimize.py +++ b/python/python/tests/test_optimize.py @@ -324,6 +324,47 @@ def test_defer_index_remap(tmp_path: Path): assert any(idx.name == "__lance_frag_reuse" for idx in indices) +@pytest.mark.parametrize("use_commit_options", [True, False]) +def test_defer_index_remap_via_commit_options(tmp_path: Path, use_commit_options: bool): + """Compaction.commit respects defer_index_remap passed in options. + + When options={"defer_index_remap": True} is supplied to Compaction.commit + the __lance_frag_reuse system index must appear in describe_indices(). + When the option is omitted (default) no such system index is written. + """ + base_dir = tmp_path / f"dataset_commit_opts_{use_commit_options}" + data = pa.table({"i": range(6_000), "val": range(6_000)}) + dataset = lance.write_dataset(data, base_dir, max_rows_per_file=1_000) + dataset.create_scalar_index("i", "BTREE") + dataset.delete("i < 500") + + plan = Compaction.plan( + dataset, + options=dict(target_rows_per_fragment=2_000, num_threads=1), + ) + rewrites = [task.execute(dataset) for task in plan.tasks] + + if use_commit_options: + Compaction.commit(dataset, rewrites, options={"defer_index_remap": True}) + else: + Compaction.commit(dataset, rewrites) + + dataset = lance.dataset(base_dir) + indices = dataset.describe_indices() + has_frag_reuse = any(idx.name == "__lance_frag_reuse" for idx in indices) + + if use_commit_options: + assert has_frag_reuse, ( + "expected __lance_frag_reuse system index when defer_index_remap=True " + "is passed to Compaction.commit" + ) + else: + assert not has_frag_reuse, ( + "did not expect __lance_frag_reuse system index when options is omitted " + "from Compaction.commit" + ) + + @pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_describe_indices_matches_list_indices_for_frag_reuse(tmp_path: Path): 
"""describe_indices() and list_indices() must agree on the index_type diff --git a/python/src/dataset/optimize.rs b/python/src/dataset/optimize.rs index 33aa32b94cd..4bb29246f45 100644 --- a/python/src/dataset/optimize.rs +++ b/python/src/dataset/optimize.rs @@ -554,26 +554,34 @@ impl PyCompaction { /// new version once committed. /// rewrites : List[RewriteResult] /// The results of the compaction tasks to include in the commit. + /// options : dict, optional + /// Compaction options to apply at commit time. + /// When absent or ``None``, defaults to ``CompactionOptions::default()``. /// /// Returns /// ------- /// CompactionMetrics #[staticmethod] + #[pyo3(signature = (dataset, rewrites, options = None))] pub fn commit( dataset: Bound, rewrites: Vec, + options: Option>, ) -> PyResult { let dataset_ref = unwrap_dataset(dataset)?; let dataset = dataset_ref.borrow().clone(); + let config = dataset.ds.manifest.config.clone(); + let opts = match options { + Some(ref dict) => parse_compaction_options(dict, &config)?, + None => CompactionOptions::default(), + }; let rewrites: Vec = rewrites.into_iter().map(|r| r.0).collect(); let mut new_ds = dataset.ds.as_ref().clone(); - // TODO: pass compaction option from plan and execute time - let options: CompactionOptions = CompactionOptions::default(); let fut = commit_compaction( &mut new_ds, rewrites, Arc::new(DatasetIndexRemapperOptions::default()), - &options, + &opts, ); let metrics = rt() .block_on(None, fut)? 
From 8e98f511eb38568d0d68799ae8e6dff581c52b3b Mon Sep 17 00:00:00 2001 From: WenDing-Y <1062698930@qq.com> Date: Wed, 17 Jun 2026 02:52:20 +0800 Subject: [PATCH 114/177] fix(dataset)!: fail-fast casting for columns with attached indices (#7158) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BREAKING CHANGE: Callers that previously depended on the silent index-drop behavior must now explicitly call `drop_index()` before `alter_columns` with a cast, then rebuild the index afterward. ## Summary Currently, calling `alter_columns` with a cast on a column that has an attached index **silently drops the index** without any warning or error. This has caused production incidents where vector search silently regressed to brute-force scan. This PR adds a **fail-fast check** before any cast is performed: if a target column has any index attached, the operation is rejected with a clear error message that names the affected column(s), the index(es), and the required remediation (`drop_index()` → cast → rebuild index). 
## Changes - Added index-awareness check in `alter_columns` that scans cast target columns for attached indices - Returns a descriptive `Error::invalid_input` with column names, index names, and remediation steps when a conflict is detected - Updated existing tests that previously relied on silent index-drop behavior to explicitly `drop_index()` before casting - Added new test `test_alter_columns_cast_fails_with_attached_index` covering the IVF_PQ index path: verifies the error message, schema integrity, index preservation, and that the cast succeeds after a manual drop --- python/python/tests/test_vector_index.py | 2 + rust/lance/src/dataset/schema_evolution.rs | 162 ++++++++++++++++++++- 2 files changed, 160 insertions(+), 4 deletions(-) diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 292b8079706..4e3addfedb8 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -1772,6 +1772,8 @@ def test_index_cast_centroids(tmp_path): values = pa.array([x for arr in centroids for x in arr], pa.float32()) centroids = pa.FixedSizeListArray.from_arrays(values, 128) + # Cast invalidates the attached index; drop it first per the new contract. 
+ dataset.drop_index(index_name) dataset.alter_columns(dict(path="vector", data_type=pa.list_(pa.float16(), 128))) # centroids are f32, but the column is now f16 diff --git a/rust/lance/src/dataset/schema_evolution.rs b/rust/lance/src/dataset/schema_evolution.rs index a2c8f05a89f..6ffa7b45396 100644 --- a/rust/lance/src/dataset/schema_evolution.rs +++ b/rust/lance/src/dataset/schema_evolution.rs @@ -12,6 +12,7 @@ use super::{ transaction::{Operation, Transaction}, write::cleanup_data_fragments, }; +use crate::index::DatasetIndexExt; use crate::{Error, Result, io::exec::Planner}; use arrow::compute::CastOptions; use arrow::compute::can_cast_types; @@ -736,6 +737,41 @@ pub(super) async fn alter_columns( new_schema.validate()?; + // If any column being cast has an attached index, fail fast. Cast operations + // rewrite the underlying column data and silently invalidate any index on the + // affected column(s). The current behavior is to drop such indices without + // warning, which has caused production incidents where vector search silently + // regressed to brute-force scan. We require users to explicitly drop the + // index before altering the column type, so the action is never silent. + if !cast_fields.is_empty() { + let indices = dataset.load_indices().await?; + let affected: Vec<&lance_table::format::IndexMetadata> = indices + .iter() + .filter(|idx| { + cast_fields + .iter() + .any(|(old, _)| idx.fields.contains(&old.id)) + }) + .collect(); + if !affected.is_empty() { + let affected_cols: Vec<String> = cast_fields + .iter() + .filter(|(old, _)| affected.iter().any(|i| i.fields.contains(&old.id))) + .map(|(old, _)| old.name.clone()) + .collect(); + let affected_idx_names: Vec<String> = affected.iter().map(|i| i.name.clone()).collect(); + return Err(Error::invalid_input(format!( + "Cannot cast column(s) [{}] to a new type: they have {} index(es) \ + attached: [{}]. Cast rewrites column data and invalidates any index \ + on the affected column(s). 
Drop the index(es) with drop_index() \ + before altering, then recreate them after the cast completes.", + affected_cols.join(", "), + affected.len(), + affected_idx_names.join(", "), + ))); + } + } + // If we aren't casting a column, we don't need to touch the fragments. let transaction = if cast_fields.is_empty() { Transaction::new( @@ -2574,7 +2610,6 @@ mod test { ) -> Result<()> { // Create a table with 2 scalar columns, 1 vector column - use crate::index::DatasetIndexExt; use arrow::datatypes::{Int32Type, Int64Type}; use arrow_array::{Float16Array, Float32Array, Int64Array, ListArray}; use half::f16; @@ -2675,7 +2710,10 @@ mod test { assert_eq!(f.files.len(), 2); }); - // Cast scalar column with index, should not keep index (TODO: keep it) + // Cast scalar column with index. The index must be dropped first; cast + // is now a fail-fast operation when an index is attached, see + // test_alter_columns_cast_fails_with_attached_index for that path. + dataset.drop_index("i_idx").await?; dataset .alter_columns(&[ColumnAlteration::new("i".into()).cast_to(DataType::Int64)]) .await?; @@ -2696,7 +2734,8 @@ mod test { ]); assert_eq!(&ArrowSchema::from(dataset.schema()), &expected_schema); - // We currently lose the index when casting a column + // The scalar index on `i` is gone (we dropped it); the vector index on + // `vec` is still present. let indices = dataset.load_indices().await?; assert_eq!(indices.len(), 1); @@ -2705,7 +2744,8 @@ mod test { assert_eq!(f.files.len(), 3); }); - // Cast vector column, should not keep index (TODO: keep it) + // Cast vector column. Drop its index first (same reason as above). + dataset.drop_index("vec_idx").await?; dataset .alter_columns(&[ ColumnAlteration::new("vec".into()).cast_to(DataType::FixedSizeList( @@ -2773,6 +2813,120 @@ mod test { Ok(()) } + /// Cast on a column with an attached index must fail fast rather than + /// silently dropping the index. 
This guards against the historical behavior + /// where cast would rewrite column data and the index would vanish without + /// any error or warning, causing vector search to silently regress to a + /// brute-force scan. + #[rstest] + #[tokio::test] + async fn test_alter_columns_cast_fails_with_attached_index( + #[values(LanceFileVersion::Legacy, LanceFileVersion::Stable)] + data_storage_version: LanceFileVersion, + ) -> Result<()> { + use lance_arrow::FixedSizeListArrayExt; + use lance_index::IndexType; + use lance_linalg::distance::MetricType; + use lance_testing::datagen::generate_random_array; + + use crate::index::vector::VectorIndexParams; + + // Build a small dataset with one indexed vector column. + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "vec", + DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float32, true)), + 64, + ), + false, + )])); + let nrows = 256; + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new( + <arrow_array::FixedSizeListArray as FixedSizeListArrayExt>::try_new_from_values( + generate_random_array(64 * nrows as usize), + 64, + ) + .unwrap(), + )], + )?; + + let test_dir = TempStrDir::default(); + let mut dataset = Dataset::write( + RecordBatchIterator::new(vec![Ok(batch)], schema.clone()), + &test_dir, + Some(WriteParams { + data_storage_version: Some(data_storage_version), + ..Default::default() + }), + ) + .await?; + + // Build an IVF_PQ index on the vector column. + let params = VectorIndexParams::ivf_pq(4, 8, 8, MetricType::L2, 50); + dataset + .create_index(&["vec"], IndexType::Vector, None, &params, false) + .await?; + + let indices_before = dataset.load_indices().await?; + assert_eq!(indices_before.len(), 1, "precondition: index exists"); + let index_name = indices_before[0].name.clone(); + + // Attempting to cast the indexed column must fail with a clear message + // that names the offending index(es). 
+ let result = dataset + .alter_columns(&[ + ColumnAlteration::new("vec".into()).cast_to(DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float16, true)), + 64, + )), + ]) + .await; + let err = result.expect_err("cast on indexed column should fail"); + let msg = err.to_string(); + assert!( + msg.contains("vec") && msg.contains(&index_name), + "error should mention column and index name, got: {msg}" + ); + assert!( + msg.contains("drop_index"), + "error should suggest the remediation, got: {msg}" + ); + + // The dataset must be unchanged: schema is still float32, index still present. + assert_eq!( + dataset.schema().field("vec").unwrap().data_type(), + DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float32, true)), + 64, + ), + ); + let indices_after = dataset.load_indices().await?; + assert_eq!(indices_after.len(), 1, "index should still exist"); + assert_eq!(indices_after[0].name, index_name); + + // Sanity check: after dropping the index, the same cast should succeed. 
+ dataset.drop_index(&index_name).await?; + dataset + .alter_columns(&[ + ColumnAlteration::new("vec".into()).cast_to(DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float16, true)), + 64, + )), + ]) + .await?; + assert_eq!( + dataset.schema().field("vec").unwrap().data_type(), + DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float16, true)), + 64, + ), + ); + + Ok(()) + } + #[rstest] #[tokio::test] async fn test_drop_columns( From 80e35ecfb93f49ccbc2adf678b089312be309136 Mon Sep 17 00:00:00 2001 From: Justin Miller Date: Tue, 16 Jun 2026 13:52:29 -0500 Subject: [PATCH 115/177] feat: bump lance-namespace-reqwest-client to 0.8.6 (source_task_size) (#7254) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Context Linear: [GEN-476](https://linear.app/lancedb/issue/GEN-476) lance-namespace [0.8.6](https://github.com/lance-format/lance-namespace/releases) added an optional `source_task_size` field to `RefreshMaterializedViewRequest` (chunker materialized-view refresh work-item size, bounds per-actor memory). The reqwest client in this repo was pinned at `0.8.4`, so the field was dropped when `RestNamespace` deserialized the request — never reaching the server. ## Change Bumps `lance-namespace-reqwest-client` `0.8.4` → `0.8.6` (workspace `Cargo.toml` + `Cargo.lock`). Additive and **code-free**: the PyO3 binding (`python/src/namespace.rs`) `depythonize`s the typed `RefreshMaterializedViewRequest`, so the new field flows automatically once the crate carries it. 
## Test plan - `cargo update -p lance-namespace-reqwest-client` → `v0.8.6` - `cargo check -p lance-namespace` → clean (builds against reqwest-client 0.8.6) - *(Local full build of the AWS-credential-vendor path hits a pre-existing `aws-smithy-types` E0119 under rustc 1.91, unrelated to this bump — CI's pinned toolchain builds it.)* ## Follow-up Publish a pylance beta from this so geneva can pin `lance-namespace>=0.8.6` + the new pylance and have `source_task_size` transit `db://` end-to-end. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-authored-by: Claude Opus 4.8 Co-authored-by: Will Jones --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index a95f530de38..657d9fde72c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -72,7 +72,7 @@ lance-linalg = { version = "=8.0.0-beta.14", path = "./rust/lance-linalg" } lance-namespace = { version = "=8.0.0-beta.14", path = "./rust/lance-namespace" } lance-namespace-impls = { version = "=8.0.0-beta.14", path = "./rust/lance-namespace-impls" } lance-namespace-datafusion = { version = "=7.0.0-beta.9", path = "./rust/lance-namespace-datafusion" } -lance-namespace-reqwest-client = "0.8.4" +lance-namespace-reqwest-client = "0.8.6" lance-select = { version = "=8.0.0-beta.14", path = "./rust/lance-select" } lance-tokenizer = { version = "=8.0.0-beta.14", path = "./rust/lance-tokenizer" } lance-table = { version = "=8.0.0-beta.14", path = "./rust/lance-table" } From 1b78086bbaa51b81a342522f6af0d879c26070be Mon Sep 17 00:00:00 2001 From: zhangyang <33365824+zhangyang0418@users.noreply.github.com> Date: Wed, 17 Jun 2026 02:58:34 +0800 Subject: [PATCH 116/177] fix: handle empty batch from deletion in add_columns_from_stream (#7233) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What Fixes a panic in `add_columns` when a fragment's deleted rows fill an entire read batch. Closes #7232. 
## Root cause In `add_columns_from_stream`, the updater reads each fragment in physical batches and applies the deletion vector via `filter_record_batch`. When **every row in a physical batch is deleted**, the filter yields a **0-row batch**. The outer loop then sets `rows_remaining = 0`, the inner `while rows_remaining > 0` loop never runs, `batches` stays empty, and `concat_batches(&batches[0].schema(), ..)` panics with `index out of bounds: the len is 0 but the index is 0`. This was hit in production using a merge-columns workflow over a dataset whose fragment had a deletion file, where the deleted rows happened to align with a read-batch boundary. ## Fix When the updater yields a 0-row batch, feed an empty batch back to the updater (to keep it in sync) and continue: ```rust if rows_remaining == 0 { updater.update(RecordBatch::new_empty(stream.schema())).await?; continue; } ``` ## Test Adds `test_add_columns_with_fully_deleted_batch`: writes a single fragment with 105 rows, deletes the trailing 5 rows so that — read with `batch_size=50` — the last batch `[100..105)` is fully deleted, then verifies `add_columns` succeeds and the new column has the correct values. The test panics on the unpatched code and passes with the fix. Co-authored-by: 张杨 Co-authored-by: Will Jones --- rust/lance/src/dataset/schema_evolution.rs | 74 ++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/rust/lance/src/dataset/schema_evolution.rs b/rust/lance/src/dataset/schema_evolution.rs index 6ffa7b45396..5ef35a33ab7 100644 --- a/rust/lance/src/dataset/schema_evolution.rs +++ b/rust/lance/src/dataset/schema_evolution.rs @@ -616,6 +616,17 @@ async fn add_columns_from_stream( debug_assert_eq!(batch.num_columns(), 1); let mut rows_remaining = batch.num_rows(); + // The updater yields an empty batch when every row in a read batch + // has been deleted (e.g. a whole batch falls within the deletion + // vector). 
There is nothing to pull from the stream in that case, so + // feed an empty batch back to keep the updater in sync and continue. + if rows_remaining == 0 { + updater + .update(RecordBatch::new_empty(stream.schema())) + .await?; + continue; + } + let mut batches = Vec::new(); while rows_remaining > 0 { @@ -1073,6 +1084,69 @@ mod test { Ok(()) } + #[tokio::test] + async fn test_add_columns_with_fully_deleted_batch() -> Result<()> { + // Regression test: when an entire read batch has been deleted, the + // updater yields a 0-row batch. The inner loop then never runs and + // `batches` stays empty, so `concat_batches(&batches[0]..)` used to + // panic with "index out of bounds: the len is 0 but the index is 0". + // + // A single fragment holds 105 rows; deleting the trailing 5 rows means + // that, when read with batch_size=50, the third batch [100..105) is + // fully filtered out and produces an empty batch. + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "i", + DataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..105))], + )?; + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + + let test_dir = TempStrDir::default(); + let test_uri = &test_dir; + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + max_rows_per_file: 200, // keep all rows in a single fragment + ..Default::default() + }), + ) + .await?; + + // Delete the entire trailing batch [100..105). 
+ dataset.delete("i >= 100").await?; + assert_eq!(dataset.count_rows(None).await?, 100); + + let new_schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "j", + DataType::Int32, + false, + )])); + let new_batch = RecordBatch::try_new( + new_schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(0..100))], + )?; + let reader = RecordBatchIterator::new(vec![Ok(new_batch)], new_schema.clone()); + + // Read with batch_size=50 so the deleted trailing rows form a full empty batch. + dataset + .add_columns(NewColumnTransform::Reader(Box::new(reader)), None, Some(50)) + .await?; + + let data = dataset.scan().try_into_batch().await?; + assert_eq!(data.num_rows(), 100); + assert_eq!( + data.column_by_name("j").unwrap().as_ref(), + &Int32Array::from_iter_values(0..100) + ); + + Ok(()) + } + #[rstest] #[tokio::test] async fn test_add_columns_cleans_up_blob_v2_data_on_stream_error( From 1f98d89507dfa5f4f16f0ed33bd55efc8d140110 Mon Sep 17 00:00:00 2001 From: Armaan Sandhu <74664101+Ar-maan05@users.noreply.github.com> Date: Wed, 17 Jun 2026 00:44:58 +0530 Subject: [PATCH 117/177] fix: merge_insert silently drops matches when a leading payload column is all-null (#7251) fix: merge_insert silently drops matches when a leading payload column is all-null ## Problem A partial-schema `merge_insert` (`when_matched_update_all`) against a table that has a scalar index on the join key can silently update **0 rows**, no error, no warning, when the first column of the source is all-null. Dropping the index makes it work again. Reported as lancedb/lancedb#3515 (and the related lancedb/lancedb#3177). Minimal repro (from the lancedb issue): ```python schema = pa.schema([ pa.field("vector", pa.list_(pa.float32(), 4), nullable=True), # all None pa.field("path", pa.string(), nullable=False), # join key pa.field("status", pa.utf8()), pa.field("file_size", pa.int64()), ]) tbl = db.create_table("test", schema=schema) tbl.add(...) 
# 1000 rows, vector = None tbl.create_scalar_index("path", index_type="BTREE") tbl.merge_insert("path").when_matched_update_all().execute(updates) # 128 rows # -> num_updated_rows == 0 (expected 128) ``` ## Root cause A scalar index on the join key routes the merge through the legacy `Merger` (see `can_use_create_plan`: `would_use_scalar_index` disables the v2 fast path). The `Merger` reads a full-outer-join stream and, for each row, decides whether the row came from the source side, the target side, or both, by checking whether the join **keys** are NULL-padded. But `extract_selections` checked the columns at positions `[0, num_keys)` instead of the actual key columns: ```rust let in_left = Self::not_all_null(combined_batch, 0, num_keys)?; let in_right = Self::not_all_null(combined_batch, right_offset, num_keys)?; ``` This assumes the key columns are physically first. They are not: a partial-schema source preserves the user's column order, so here column 0 is `vector`. On the target side that column is all-null (the original rows were inserted with `vector = None`), so `in_right` was `false` for **every matched row** -> `in_both` empty -> 0 updates, silently. The existing full-schema indexed test only passed by luck: its column 0 happened to be non-null on both sides. ## Fix Locate the join-key columns by name and test those (the target half carries the same columns in the same order, offset by `right_offset`): ```rust let source_key_cols = self.params.on.iter() .map(|key| combined_batch.schema().index_of(key))...; let target_key_cols = source_key_cols.iter().map(|c| c + right_offset)...; let in_left = Self::not_all_null(combined_batch, &source_key_cols)?; let in_right = Self::not_all_null(combined_batch, &target_key_cols)?; ``` `not_all_null` now takes an explicit column-index slice instead of a contiguous `(offset, len)` range. 
## Tests Added `test_repro_3515_partial_schema_fully_indexed`, parameterized over storage versions V2_0 / V2_1 / V2_2, mirroring the issue (all-null leading vector column, scalar index covering every fragment, partial-schema update). It fails on `main` (0 updates) and passes with the fix. All 143 tests in the `merge_insert` module pass; `cargo fmt --all --check` and `cargo clippy -p lance` are clean. --- rust/lance/src/dataset/write/merge_insert.rs | 160 +++++++++++++++++-- 1 file changed, 147 insertions(+), 13 deletions(-) diff --git a/rust/lance/src/dataset/write/merge_insert.rs b/rust/lance/src/dataset/write/merge_insert.rs index 1f3414db4f8..b14421c963f 100644 --- a/rust/lance/src/dataset/write/merge_insert.rs +++ b/rust/lance/src/dataset/write/merge_insert.rs @@ -2224,18 +2224,13 @@ impl Merger { &self.output_schema } - // Retrieves a bitmap of rows where at least one of the columns in the range - // col_offset..coll_offset+num_cols is not null. - // - fn not_all_null( - batch: &RecordBatch, - col_offset: usize, - num_cols: usize, - ) -> Result { + // Retrieves a bitmap of rows where at least one of the given columns is + // not null. + fn not_all_null(batch: &RecordBatch, cols: &[usize]) -> Result { // For our purposes we know there is always at least 1 on key - debug_assert_ne!(num_cols, 0); - let mut at_least_one_valid = arrow::compute::is_not_null(batch.column(col_offset))?; - for idx in col_offset + 1..col_offset + num_cols { + debug_assert!(!cols.is_empty()); + let mut at_least_one_valid = arrow::compute::is_not_null(batch.column(cols[0]))?; + for &idx in &cols[1..] 
{ let is_valid = arrow::compute::is_not_null(batch.column(idx))?; at_least_one_valid = arrow::compute::or(&at_least_one_valid, &is_valid)?; } @@ -2263,8 +2258,37 @@ impl Merger { right_offset: usize, num_keys: usize, ) -> Result<(BooleanArray, BooleanArray, BooleanArray)> { - let in_left = Self::not_all_null(combined_batch, 0, num_keys)?; - let in_right = Self::not_all_null(combined_batch, right_offset, num_keys)?; + // The outer join distinguishes its three cases by which side's join + // keys were NULL-padded: a present row always has non-null keys, while + // the absent side is filled with NULLs. We therefore test the *key* + // columns, located by name. They are NOT necessarily the first + // `num_keys` columns — a partial-schema source can place a payload + // column (e.g. an all-null vector) at position 0, and checking + // positions [0, num_keys) there misreads an all-null leading payload + // column as an absent join side, silently dropping every matched row + // (https://github.com/lancedb/lancedb/issues/3515). The target half + // carries the same columns in the same order, offset by `right_offset`. 
+ let source_key_cols = self + .params + .on + .iter() + .map(|key| { + combined_batch.schema().index_of(key).map_err(|_| { + Error::internal(format!( + "merge insert key column '{}' not found in joined batch", + key + )) + }) + }) + .collect::<Result<Vec<_>>>()?; + debug_assert_eq!(source_key_cols.len(), num_keys); + let target_key_cols = source_key_cols + .iter() + .map(|c| c + right_offset) + .collect::<Vec<_>>(); + + let in_left = Self::not_all_null(combined_batch, &source_key_cols)?; + let in_right = Self::not_all_null(combined_batch, &target_key_cols)?; let in_both = arrow::compute::and(&in_left, &in_right)?; let left_only = arrow::compute::and(&in_left, &arrow::compute::not(&in_right)?)?; let right_only = arrow::compute::and(&arrow::compute::not(&in_left)?, &in_right)?; @@ -3517,6 +3541,116 @@ mod tests { } } + /// Reproduces https://github.com/lancedb/lancedb/issues/3515: + /// a partial-schema `merge_insert` with a scalar index on the join key, + /// where every fragment is covered by the index (no unindexed data), + /// silently updates 0 rows instead of the expected matches. + #[rstest::rstest] + #[tokio::test] + async fn test_repro_3515_partial_schema_fully_indexed( + #[values(LanceFileVersion::V2_0, LanceFileVersion::V2_1, LanceFileVersion::V2_2)] + version: LanceFileVersion, + ) { + const N: usize = 1000; + const UPD: usize = 128; + let vec_field = Field::new( + "vector", + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 4), + true, + ); + let full_schema = Arc::new(Schema::new(vec![ + vec_field.clone(), + Field::new("path", DataType::Utf8, false), + Field::new("status", DataType::Utf8, true), + Field::new("file_size", DataType::Int64, true), + ])); + + // 1000 rows: vector all-null, path "/img/{i}.jpg", status "pending". 
+ let paths = StringArray::from((0..N).map(|i| format!("/img/{i}.jpg")).collect::<Vec<_>>()); + let statuses = StringArray::from(vec!["pending"; N]); + let file_sizes = Int64Array::from((0..N as i64).map(|i| 1000 + i).collect::<Vec<_>>()); + let null_vectors = arrow_array::new_null_array(vec_field.data_type(), N); + let batch = RecordBatch::try_new( + full_schema.clone(), + vec![ + null_vectors, + Arc::new(paths), + Arc::new(statuses), + Arc::new(file_sizes), + ], + ) + .unwrap(); + + let mut ds = Dataset::write( + RecordBatchIterator::new([Ok(batch)], full_schema.clone()), + "memory://", + Some(WriteParams { + data_storage_version: Some(version), + ..Default::default() + }), + ) + .await + .unwrap(); + + // Scalar index on the merge key, covering every fragment. + ds.create_index( + &["path"], + IndexType::Scalar, + None, + &ScalarIndexParams::default(), + false, + ) + .await + .unwrap(); + let ds = Arc::new(ds); + + // Partial-schema source (no `file_size`): update the first 128 rows. + let upd_schema = Arc::new(Schema::new(vec![ + vec_field, + Field::new("path", DataType::Utf8, false), + Field::new("status", DataType::Utf8, true), + ])); + let upd_paths = StringArray::from( + (0..UPD) + .map(|i| format!("/img/{i}.jpg")) + .collect::<Vec<_>>(), + ); + let upd_vectors = + FixedSizeListArray::try_new_from_values(Float32Array::from(vec![0.1f32; 4 * UPD]), 4) + .unwrap(); + let upd_statuses = StringArray::from(vec!["indexed"; UPD]); + let updates = RecordBatch::try_new( + upd_schema.clone(), + vec![ + Arc::new(upd_vectors), + Arc::new(upd_paths), + Arc::new(upd_statuses), + ], + ) + .unwrap(); + + let (ds, stats) = MergeInsertBuilder::try_new(ds.clone(), vec!["path".to_string()]) + .unwrap() + .when_matched(WhenMatched::UpdateAll) + .when_not_matched(WhenNotMatched::DoNothing) + .try_build() + .unwrap() + .execute_reader(RecordBatchIterator::new([Ok(updates)], upd_schema)) + .await + .unwrap(); + + assert_eq!( + stats.num_updated_rows, UPD as u64, + "expected {UPD} updated rows on 
{version:?}, got {}", + stats.num_updated_rows + ); + let n_indexed = ds + .count_rows(Some("status = 'indexed'".to_string())) + .await + .unwrap(); + assert_eq!(n_indexed, UPD, "expected {UPD} rows flipped to 'indexed'"); + } + #[tokio::test] async fn test_indexed_merge_insert() { let test_dir = TempStrDir::default(); From e60f5696fd92a94e531c25115febc8554bbe7764 Mon Sep 17 00:00:00 2001 From: Lance Release Bot Date: Tue, 16 Jun 2026 20:37:11 +0000 Subject: [PATCH 118/177] chore: release beta version 8.0.0-beta.15 --- .bumpversion.toml | 2 +- Cargo.lock | 178 +++++++++++++++++++------------------- Cargo.toml | 44 +++++----- java/lance-jni/Cargo.lock | 45 +++++----- java/lance-jni/Cargo.toml | 2 +- java/pom.xml | 2 +- python/Cargo.lock | 156 ++++++++++++++++----------------- python/Cargo.toml | 2 +- 8 files changed, 216 insertions(+), 215 deletions(-) diff --git a/.bumpversion.toml b/.bumpversion.toml index 91adc6c59c1..61131e9d8b8 100644 --- a/.bumpversion.toml +++ b/.bumpversion.toml @@ -1,5 +1,5 @@ [tool.bumpversion] -current_version = "8.0.0-beta.14" +current_version = "8.0.0-beta.15" parse = "(?P<major>\\d+)\\.(?P<minor>\\d+)\\.(?P<patch>\\d+)(-(?P<prerelease>(beta|rc))\\.(?P<prerelease_num>\\d+))?" 
serialize = [ "{major}.{minor}.{patch}-{prerelease}.{prerelease_num}", diff --git a/Cargo.lock b/Cargo.lock index 5bee7197f24..ac919e25a1d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -460,7 +460,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -471,7 +471,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1225,7 +1225,7 @@ checksum = "89385e82b5d1821d2219e0b095efa2cc1f246cbf99080f3be46a1a85c0d392d9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1419,7 +1419,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1908,7 +1908,7 @@ dependencies = [ "proc-macro2", "quote", "strsim 0.11.1", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1921,7 +1921,7 @@ dependencies = [ "proc-macro2", "quote", "strsim 0.11.1", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1943,7 +1943,7 @@ checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" dependencies = [ "darling_core 0.20.11", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1954,7 +1954,7 @@ checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" dependencies = [ "darling_core 0.23.0", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -2410,7 +2410,7 @@ checksum = "2e367e6a71051d0ebdd29b2f85d12059b38b1d1f172c6906e80016da662226bd" dependencies = [ "datafusion-doc", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -2693,7 +2693,7 @@ dependencies = [ "darling 0.20.11", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -2713,7 +2713,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" 
dependencies = [ "derive_builder_core 0.20.2", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -2775,7 +2775,7 @@ checksum = "1ac70aa55017e108007fbaf5aa0f54b021c98f92ff8af59d42eda9da96e3dd4f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -2900,7 +2900,7 @@ checksum = "44f23cf4b44bfce11a86ace86f8a73ffdec849c9fd00a386a53d278bd9e81fb3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -3076,7 +3076,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow-array", "rand 0.9.4", @@ -3155,7 +3155,7 @@ checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -3415,7 +3415,7 @@ checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -4252,7 +4252,7 @@ checksum = "782d32378dddf207193ac91cefb848ad41abb58195c95168e1291227a0832b47" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -4297,7 +4297,7 @@ dependencies = [ "quote", "rustc_version", "simd_cesu8", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -4316,7 +4316,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264" dependencies = [ "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -4388,7 +4388,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "all_asserts", "approx", @@ -4491,7 +4491,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ 
"arrow-array", "arrow-buffer", @@ -4539,7 +4539,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrayref", "paste", @@ -4548,7 +4548,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow-array", "arrow-buffer", @@ -4588,7 +4588,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow", "arrow-array", @@ -4621,7 +4621,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow", "arrow-array", @@ -4640,16 +4640,16 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] name = "lance-encoding" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow-arith", "arrow-array", @@ -4694,7 +4694,7 @@ dependencies = [ [[package]] name = "lance-examples" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "all_asserts", "arrow", @@ -4720,7 +4720,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow-arith", "arrow-array", @@ -4759,7 +4759,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "datafusion", "geo-traits", @@ -4773,7 +4773,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "approx", "arc-swap", @@ -4850,7 +4850,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow", "arrow-arith", @@ -4898,7 +4898,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.14" 
+version = "8.0.0-beta.15" dependencies = [ "approx", "arrow-array", @@ -4917,7 +4917,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow", "async-trait", @@ -4929,7 +4929,7 @@ dependencies = [ [[package]] name = "lance-namespace-datafusion" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow-array", "arrow-schema", @@ -4945,7 +4945,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow", "arrow-array", @@ -5009,7 +5009,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow-array", "arrow-buffer", @@ -5027,7 +5027,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow", "arrow-array", @@ -5073,16 +5073,16 @@ dependencies = [ [[package]] name = "lance-test-macros" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] name = "lance-testing" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow-array", "arrow-schema", @@ -5095,7 +5095,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "icu_segmenter", "jieba-rs", @@ -5107,7 +5107,7 @@ dependencies = [ [[package]] name = "lance-tools" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "clap", "lance-core", @@ -5540,7 +5540,7 @@ dependencies = [ "cfg-if 1.0.4", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -5582,7 +5582,7 @@ checksum = "e4db6d5580af57bf992f59068d4ea26fd518574ff48d7639b255a36f9de6e7e9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -5614,7 +5614,7 @@ checksum = 
"4568f25ccbd45ab5d5603dc34318c1ec56b117531781260002151b8530a9f931" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -5807,7 +5807,7 @@ dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -6247,7 +6247,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -6567,7 +6567,7 @@ checksum = "c96395f0a926bc13b1c17622aaddda1ecb55d49c8f1bf9777e4d877800a43f8b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -6760,7 +6760,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -6825,7 +6825,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.117", + "syn 2.0.118", "tempfile", ] @@ -6839,7 +6839,7 @@ dependencies = [ "itertools 0.14.0", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -6877,7 +6877,7 @@ checksum = "7347867d0a7e1208d93b46767be83e2b8f978c3dad35f775ac8d8847551d6fe1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7209,7 +7209,7 @@ checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7567,7 +7567,7 @@ checksum = "5d2ed0b54125315fb36bd021e82d314d1c126548f871634b483f46b31d13cac6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7643,7 +7643,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.117", + "syn 2.0.118", "unicode-ident", ] @@ -7880,7 +7880,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] 
@@ -7972,7 +7972,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7983,7 +7983,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8018,7 +8018,7 @@ checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8030,7 +8030,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8074,7 +8074,7 @@ dependencies = [ "darling 0.23.0", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8125,7 +8125,7 @@ checksum = "94e153fc76e1c6a068703d6d29c508a0b15c061c4b7e43da59cc097bc342673c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8286,7 +8286,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8377,7 +8377,7 @@ checksum = "a6dd45d8fc1c79299bfbb7190e42ccbbdf6a5f52e4a6ad98d92357ea965bd289" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8460,7 +8460,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8472,7 +8472,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8495,7 +8495,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.117", + "syn 2.0.118", "typify", "walkdir", ] @@ -8548,9 +8548,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.117" +version = "2.0.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +checksum = 
"1b9ae57f904213ebb649ce6895b8a66c66f0203b9319718f69a5612a065b1422" dependencies = [ "proc-macro2", "quote", @@ -8574,7 +8574,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8662,7 +8662,7 @@ checksum = "c26ef8b00e4d382e59f6a8ddb3cd790b3a5bb29f21a358a9a69ea2f29f13f27b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8671,7 +8671,7 @@ version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "944ad38adcbb71eaa682c56bceeb079e4ca82b4b3edc2a0fde5cb297b77dac8d" dependencies = [ - "syn 2.0.117", + "syn 2.0.118", "test-log-core", ] @@ -8701,7 +8701,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8712,7 +8712,7 @@ checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8878,7 +8878,7 @@ checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -9110,7 +9110,7 @@ checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -9248,7 +9248,7 @@ dependencies = [ "semver", "serde", "serde_json", - "syn 2.0.117", + "syn 2.0.118", "thiserror 2.0.18", "unicode-ident", ] @@ -9266,7 +9266,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.117", + "syn 2.0.118", "typify-impl", ] @@ -9552,7 +9552,7 @@ dependencies = [ "bumpalo", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", "wasm-bindgen-shared", ] @@ -9769,7 +9769,7 @@ checksum = 
"053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -9780,7 +9780,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -10070,7 +10070,7 @@ dependencies = [ "heck", "indexmap 2.14.0", "prettyplease", - "syn 2.0.117", + "syn 2.0.118", "wasm-metadata", "wit-bindgen-core", "wit-component", @@ -10086,7 +10086,7 @@ dependencies = [ "prettyplease", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", "wit-bindgen-core", "wit-bindgen-rust", ] @@ -10352,7 +10352,7 @@ checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", "synstructure", ] @@ -10373,7 +10373,7 @@ checksum = "1ae7f38b72ec2a254e2b87ef277cf2cd4fb97cbebf944faa6f33354da0867930" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -10393,7 +10393,7 @@ checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", "synstructure", ] @@ -10435,7 +10435,7 @@ checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 657d9fde72c..5a888fdd1cb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ resolver = "3" [workspace.package] -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" @@ -57,27 +57,27 @@ rust-version = "1.91.0" [workspace.dependencies] arc-swap = "1.7" libc = "0.2.176" -lance = { version = "=8.0.0-beta.14", path = "./rust/lance", default-features = false } -lance-arrow = { version = "=8.0.0-beta.14", path = "./rust/lance-arrow" } 
-lance-core = { version = "=8.0.0-beta.14", path = "./rust/lance-core" } -lance-datafusion = { version = "=8.0.0-beta.14", path = "./rust/lance-datafusion" } -lance-datagen = { version = "=8.0.0-beta.14", path = "./rust/lance-datagen" } -lance-derive = { version = "=8.0.0-beta.14", path = "./rust/lance-derive" } -lance-encoding = { version = "=8.0.0-beta.14", path = "./rust/lance-encoding" } -lance-file = { version = "=8.0.0-beta.14", path = "./rust/lance-file" } -lance-geo = { version = "=8.0.0-beta.14", path = "./rust/lance-geo" } -lance-index = { version = "=8.0.0-beta.14", path = "./rust/lance-index" } -lance-io = { version = "=8.0.0-beta.14", path = "./rust/lance-io", default-features = false } -lance-linalg = { version = "=8.0.0-beta.14", path = "./rust/lance-linalg" } -lance-namespace = { version = "=8.0.0-beta.14", path = "./rust/lance-namespace" } -lance-namespace-impls = { version = "=8.0.0-beta.14", path = "./rust/lance-namespace-impls" } +lance = { version = "=8.0.0-beta.15", path = "./rust/lance", default-features = false } +lance-arrow = { version = "=8.0.0-beta.15", path = "./rust/lance-arrow" } +lance-core = { version = "=8.0.0-beta.15", path = "./rust/lance-core" } +lance-datafusion = { version = "=8.0.0-beta.15", path = "./rust/lance-datafusion" } +lance-datagen = { version = "=8.0.0-beta.15", path = "./rust/lance-datagen" } +lance-derive = { version = "=8.0.0-beta.15", path = "./rust/lance-derive" } +lance-encoding = { version = "=8.0.0-beta.15", path = "./rust/lance-encoding" } +lance-file = { version = "=8.0.0-beta.15", path = "./rust/lance-file" } +lance-geo = { version = "=8.0.0-beta.15", path = "./rust/lance-geo" } +lance-index = { version = "=8.0.0-beta.15", path = "./rust/lance-index" } +lance-io = { version = "=8.0.0-beta.15", path = "./rust/lance-io", default-features = false } +lance-linalg = { version = "=8.0.0-beta.15", path = "./rust/lance-linalg" } +lance-namespace = { version = "=8.0.0-beta.15", path = "./rust/lance-namespace" } 
+lance-namespace-impls = { version = "=8.0.0-beta.15", path = "./rust/lance-namespace-impls" } lance-namespace-datafusion = { version = "=7.0.0-beta.9", path = "./rust/lance-namespace-datafusion" } lance-namespace-reqwest-client = "0.8.6" -lance-select = { version = "=8.0.0-beta.14", path = "./rust/lance-select" } -lance-tokenizer = { version = "=8.0.0-beta.14", path = "./rust/lance-tokenizer" } -lance-table = { version = "=8.0.0-beta.14", path = "./rust/lance-table" } -lance-test-macros = { version = "=8.0.0-beta.14", path = "./rust/lance-test-macros" } -lance-testing = { version = "=8.0.0-beta.14", path = "./rust/lance-testing" } +lance-select = { version = "=8.0.0-beta.15", path = "./rust/lance-select" } +lance-tokenizer = { version = "=8.0.0-beta.15", path = "./rust/lance-tokenizer" } +lance-table = { version = "=8.0.0-beta.15", path = "./rust/lance-table" } +lance-test-macros = { version = "=8.0.0-beta.15", path = "./rust/lance-test-macros" } +lance-testing = { version = "=8.0.0-beta.15", path = "./rust/lance-testing" } approx = "0.5.1" # Note that this one does not include pyarrow arrow = { version = "58.0.0", optional = false, features = ["prettyprint"] } @@ -104,7 +104,7 @@ half = { "version" = "2.1", default-features = false, features = [ "num-traits", "std", ] } -lance-bitpacking = { version = "=8.0.0-beta.14", path = "./rust/compression/bitpacking" } +lance-bitpacking = { version = "=8.0.0-beta.15", path = "./rust/compression/bitpacking" } bitpacking = "0.9" bitvec = "1" bytes = "1.11.1" @@ -143,7 +143,7 @@ datafusion-substrait = { version = "53.0.0", default-features = false } dirs = "6.0.0" either = "1.0" fst = { version = "0.4.7", features = ["levenshtein"] } -fsst = { version = "=8.0.0-beta.14", path = "./rust/compression/fsst" } +fsst = { version = "=8.0.0-beta.15", path = "./rust/compression/fsst" } futures = "0.3" geoarrow-array = "0.8" geoarrow-schema = "0.8" diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index 
ea9a0c0848f..39121f92a2c 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -2479,7 +2479,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow-array", "rand 0.9.4", @@ -3673,7 +3673,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arc-swap", "arrow", @@ -3746,7 +3746,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow-array", "arrow-buffer", @@ -3788,7 +3788,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrayref", "paste", @@ -3797,7 +3797,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow-array", "arrow-buffer", @@ -3835,7 +3835,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow", "arrow-array", @@ -3867,7 +3867,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow", "arrow-array", @@ -3884,7 +3884,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "proc-macro2", "quote", @@ -3893,7 +3893,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow-arith", "arrow-array", @@ -3928,7 +3928,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow-arith", "arrow-array", @@ -3958,7 +3958,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = 
"8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "datafusion", "geo-traits", @@ -3972,7 +3972,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arc-swap", "arrow", @@ -4027,6 +4027,7 @@ dependencies = [ "rand_distr", "rangemap", "rayon", + "regex-syntax", "roaring", "serde", "serde_json", @@ -4039,7 +4040,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow", "arrow-arith", @@ -4080,7 +4081,7 @@ dependencies = [ [[package]] name = "lance-jni" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow", "arrow-array", @@ -4116,7 +4117,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow-array", "arrow-buffer", @@ -4131,7 +4132,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow", "async-trait", @@ -4143,7 +4144,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow", "arrow-ipc", @@ -4192,7 +4193,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow-array", "arrow-buffer", @@ -4207,7 +4208,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow", "arrow-array", @@ -4244,7 +4245,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "icu_segmenter", "rust-stemmers", @@ -6847,9 +6848,9 @@ checksum = "a7973cce6668464ea31f176d85b13c7ab3bba2cb3b77a2ed26abd7801688010a" [[package]] name = "syn" -version = "2.0.117" +version = "2.0.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +checksum = "1b9ae57f904213ebb649ce6895b8a66c66f0203b9319718f69a5612a065b1422" dependencies = [ "proc-macro2", "quote", diff --git a/java/lance-jni/Cargo.toml b/java/lance-jni/Cargo.toml index deb97874e82..c759a59858b 100644 --- a/java/lance-jni/Cargo.toml +++ b/java/lance-jni/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lance-jni" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" edition = "2024" authors = ["Lance Devs "] rust-version = "1.91" diff --git a/java/pom.xml b/java/pom.xml index 61bf90facf2..fc109711a07 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -7,7 +7,7 @@ org.lance lance-core Lance Core - 8.0.0-beta.14 + 8.0.0-beta.15 jar Lance Format Java API diff --git a/python/Cargo.lock b/python/Cargo.lock index 37035be3085..5275411197f 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -508,7 +508,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -519,7 +519,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1186,7 +1186,7 @@ checksum = "89385e82b5d1821d2219e0b095efa2cc1f246cbf99080f3be46a1a85c0d392d9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1347,7 +1347,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1742,7 +1742,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1755,7 +1755,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -1766,7 +1766,7 @@ checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" dependencies = [ "darling_core 0.20.11", "quote", - "syn 2.0.117", + "syn 2.0.118", ] 
[[package]] @@ -1777,7 +1777,7 @@ checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" dependencies = [ "darling_core 0.23.0", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -2294,7 +2294,7 @@ checksum = "2e367e6a71051d0ebdd29b2f85d12059b38b1d1f172c6906e80016da662226bd" dependencies = [ "datafusion-doc", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -2568,7 +2568,7 @@ dependencies = [ "darling 0.20.11", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -2578,7 +2578,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -2634,7 +2634,7 @@ checksum = "1ac70aa55017e108007fbaf5aa0f54b021c98f92ff8af59d42eda9da96e3dd4f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -2859,7 +2859,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow-array", "rand 0.9.4", @@ -2936,7 +2936,7 @@ checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -3193,7 +3193,7 @@ checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -3939,7 +3939,7 @@ checksum = "782d32378dddf207193ac91cefb848ad41abb58195c95168e1291227a0832b47" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -3984,7 +3984,7 @@ dependencies = [ "quote", "rustc_version", "simd_cesu8", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -4003,7 +4003,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264" dependencies = [ "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -4075,7 +4075,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arc-swap", "arrow", @@ -4149,7 +4149,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow-array", "arrow-buffer", @@ -4191,7 +4191,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrayref", "paste", @@ -4200,7 +4200,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow-array", "arrow-buffer", @@ -4238,7 +4238,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow", "arrow-array", @@ -4270,7 +4270,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow", "arrow-array", @@ -4287,16 +4287,16 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] name = "lance-encoding" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow-arith", "arrow-array", @@ -4331,7 +4331,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow-arith", "arrow-array", @@ -4361,7 +4361,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "datafusion", "geo-traits", @@ -4375,7 +4375,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.14" 
+version = "8.0.0-beta.15" dependencies = [ "arc-swap", "arrow", @@ -4444,7 +4444,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow", "arrow-arith", @@ -4485,7 +4485,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow-array", "arrow-buffer", @@ -4500,7 +4500,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow", "async-trait", @@ -4512,7 +4512,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow", "arrow-ipc", @@ -4561,7 +4561,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow-array", "arrow-buffer", @@ -4576,7 +4576,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "arrow", "arrow-array", @@ -4615,7 +4615,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "icu_segmenter", "jieba-rs", @@ -5035,7 +5035,7 @@ checksum = "4568f25ccbd45ab5d5603dc34318c1ec56b117531781260002151b8530a9f931" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -5180,7 +5180,7 @@ dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -5857,7 +5857,7 @@ checksum = "c96395f0a926bc13b1c17622aaddda1ecb55d49c8f1bf9777e4d877800a43f8b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -5964,7 +5964,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn 2.0.117", + "syn 
2.0.118", ] [[package]] @@ -6010,7 +6010,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.117", + "syn 2.0.118", "tempfile", ] @@ -6024,7 +6024,7 @@ dependencies = [ "itertools 0.14.0", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -6053,12 +6053,12 @@ checksum = "7347867d0a7e1208d93b46767be83e2b8f978c3dad35f775ac8d8847551d6fe1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] name = "pylance" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" dependencies = [ "alloc-stdlib", "arrow", @@ -6149,7 +6149,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -6162,7 +6162,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -6459,7 +6459,7 @@ checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -6808,7 +6808,7 @@ checksum = "5d2ed0b54125315fb36bd021e82d314d1c126548f871634b483f46b31d13cac6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7073,7 +7073,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7165,7 +7165,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7176,7 +7176,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7211,7 +7211,7 @@ checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7223,7 +7223,7 @@ dependencies = [ "proc-macro2", "quote", "serde", 
- "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7267,7 +7267,7 @@ dependencies = [ "darling 0.23.0", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7443,7 +7443,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7508,7 +7508,7 @@ checksum = "a6dd45d8fc1c79299bfbb7190e42ccbbdf6a5f52e4a6ad98d92357ea965bd289" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7579,7 +7579,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7591,7 +7591,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7614,7 +7614,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.117", + "syn 2.0.118", "typify", "walkdir", ] @@ -7644,9 +7644,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.117" +version = "2.0.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +checksum = "1b9ae57f904213ebb649ce6895b8a66c66f0203b9319718f69a5612a065b1422" dependencies = [ "proc-macro2", "quote", @@ -7670,7 +7670,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7765,7 +7765,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7776,7 +7776,7 @@ checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -7899,7 +7899,7 @@ checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8121,7 
+8121,7 @@ checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8260,7 +8260,7 @@ dependencies = [ "semver", "serde", "serde_json", - "syn 2.0.117", + "syn 2.0.118", "thiserror 2.0.18", "unicode-ident", ] @@ -8278,7 +8278,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.117", + "syn 2.0.118", "typify-impl", ] @@ -8508,7 +8508,7 @@ dependencies = [ "bumpalo", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", "wasm-bindgen-shared", ] @@ -8716,7 +8716,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8727,7 +8727,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -8985,7 +8985,7 @@ dependencies = [ "heck", "indexmap 2.14.0", "prettyplease", - "syn 2.0.117", + "syn 2.0.118", "wasm-metadata", "wit-bindgen-core", "wit-component", @@ -9001,7 +9001,7 @@ dependencies = [ "prettyplease", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", "wit-bindgen-core", "wit-bindgen-rust", ] @@ -9261,7 +9261,7 @@ checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", "synstructure", ] @@ -9282,7 +9282,7 @@ checksum = "1ae7f38b72ec2a254e2b87ef277cf2cd4fb97cbebf944faa6f33354da0867930" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] @@ -9302,7 +9302,7 @@ checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.118", "synstructure", ] @@ -9344,7 +9344,7 @@ checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" dependencies = [ "proc-macro2", 
"quote", - "syn 2.0.117", + "syn 2.0.118", ] [[package]] diff --git a/python/Cargo.toml b/python/Cargo.toml index f6a3c67381d..3bfdc548fca 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pylance" -version = "8.0.0-beta.14" +version = "8.0.0-beta.15" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" From 2ced0a74fd0b403584611b7f610d74e18d8511cf Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Tue, 16 Jun 2026 17:09:48 -0700 Subject: [PATCH 119/177] feat: expose session cache key inventory (#7298) ## Summary - expose optional cache key inventory on cache backends and LanceCache - implement Moka key snapshot support - add Session helpers for index and metadata cache keys --- rust/lance-core/src/cache/backend.rs | 12 +++ rust/lance-core/src/cache/mod.rs | 144 ++++++++++++++++++++++++++- rust/lance-core/src/cache/moka.rs | 9 +- rust/lance/src/session.rs | 92 ++++++++++++++++- 4 files changed, 252 insertions(+), 5 deletions(-) diff --git a/rust/lance-core/src/cache/backend.rs b/rust/lance-core/src/cache/backend.rs index 237254c464f..9307868f399 100644 --- a/rust/lance-core/src/cache/backend.rs +++ b/rust/lance-core/src/cache/backend.rs @@ -22,6 +22,9 @@ use super::CacheCodec; /// A type-erased cache entry. pub type CacheEntry = Arc; +/// Iterator over cache keys currently known to a backend. +pub type CacheKeyIterator<'a> = Box + Send + 'a>; + /// Structured cache key passed to [`CacheBackend`] methods. /// /// CacheBackend impls receive these ready-made from [`LanceCache`](super::LanceCache) @@ -116,6 +119,15 @@ pub trait CacheBackend: Send + Sync + std::fmt::Debug { /// Remove all entries. async fn clear(&self); + /// Return an iterator over cache keys currently known to this backend. + /// + /// Backends that cannot enumerate keys cheaply or accurately should return + /// `None`. An empty iterator means key inventory is supported and the + /// cache currently has no entries. 
+ async fn keys(&self) -> Option> { + None + } + /// Number of entries currently stored (may flush pending operations). async fn num_entries(&self) -> usize; diff --git a/rust/lance-core/src/cache/mod.rs b/rust/lance-core/src/cache/mod.rs index f62837fe3cc..bea700cad90 100644 --- a/rust/lance-core/src/cache/mod.rs +++ b/rust/lance-core/src/cache/mod.rs @@ -49,7 +49,7 @@ pub mod backend; pub mod codec; mod moka; -pub use backend::{CacheBackend, CacheEntry, InternalCacheKey}; +pub use backend::{CacheBackend, CacheEntry, CacheKeyIterator, InternalCacheKey}; pub use codec::{CacheCodec, CacheCodecImpl}; pub use moka::MokaCacheBackend; @@ -245,6 +245,40 @@ impl LanceCache { self.cache.size_bytes().await } + /// Return an iterator over keys currently stored under this cache's prefix. + /// + /// Returns `None` when the backend does not support key inventory. The + /// iterator is intended for diagnostics and may be weakly consistent with + /// concurrent cache mutations. + /// + /// # Examples + /// + /// ``` + /// # use std::{borrow::Cow, sync::Arc}; + /// # use lance_core::cache::{CacheKey, LanceCache}; + /// # struct MyKey; + /// # impl CacheKey for MyKey { + /// # type ValueType = Vec; + /// # fn key(&self) -> Cow<'_, str> { Cow::Borrowed("my-key") } + /// # fn type_name() -> &'static str { "VecI32" } + /// # } + /// # async fn example() { + /// let cache = LanceCache::with_capacity(1024); + /// cache.insert_with_key(&MyKey, Arc::new(vec![1, 2, 3])).await; + /// + /// let mut keys = cache.keys().await.expect("Moka supports key inventory"); + /// assert_eq!(keys.next().unwrap().key(), "my-key"); + /// # } + /// ``` + pub async fn keys(&self) -> Option> { + Some(Box::new( + self.cache + .keys() + .await? 
+ .filter(|key| key.starts_with(&self.prefix)), + )) + } + // -- Sized insert/get (internal, shared by sized and unsized paths) -------- async fn insert_with_id( @@ -557,7 +591,7 @@ impl CacheStats { #[cfg(test)] mod tests { use super::*; - use std::collections::HashMap; + use std::collections::{BTreeSet, HashMap}; use std::marker::PhantomData; struct TestKey { @@ -609,6 +643,18 @@ mod tests { } } + fn key_fields(keys: &[InternalCacheKey]) -> BTreeSet<(String, String, &'static str)> { + keys.iter() + .map(|key| { + ( + key.prefix().to_string(), + key.key().to_string(), + key.type_name(), + ) + }) + .collect() + } + #[tokio::test] async fn test_cache_bytes() { let item = Arc::new(vec![1, 2, 3]); @@ -718,6 +764,99 @@ mod tests { assert_eq!(base.stats().await.hits, 1); } + #[tokio::test] + async fn test_cache_keys_with_prefixes() { + let base = LanceCache::with_capacity(1000); + let prefixed = base.with_key_prefix("ns"); + let nested = prefixed.with_key_prefix("index"); + let other = base.with_key_prefix("ns-other"); + + base.insert_with_key(&TestKey::new("root"), Arc::new(vec![0])) + .await; + prefixed + .insert_with_key(&TestKey::new("child"), Arc::new(vec![1])) + .await; + nested + .insert_with_key(&TestKey::new("nested"), Arc::new(vec![2])) + .await; + other + .insert_with_key(&TestKey::new("other"), Arc::new(vec![3])) + .await; + + let base_keys = base.keys().await.unwrap().collect::>(); + assert_eq!( + key_fields(&base_keys), + BTreeSet::from([ + ( + "".to_string(), + "root".to_string(), + TestKey::>::type_name() + ), + ( + "ns/".to_string(), + "child".to_string(), + TestKey::>::type_name() + ), + ( + "ns/index/".to_string(), + "nested".to_string(), + TestKey::>::type_name() + ), + ( + "ns-other/".to_string(), + "other".to_string(), + TestKey::>::type_name() + ), + ]) + ); + + let prefixed_keys = prefixed.keys().await.unwrap().collect::>(); + assert_eq!( + key_fields(&prefixed_keys), + BTreeSet::from([ + ( + "ns/".to_string(), + "child".to_string(), + 
TestKey::>::type_name() + ), + ( + "ns/index/".to_string(), + "nested".to_string(), + TestKey::>::type_name() + ), + ]) + ); + } + + #[tokio::test] + async fn test_cache_keys_reflect_invalidation_and_clear() { + let base = LanceCache::with_capacity(1000); + let prefixed = base.with_key_prefix("ns"); + let other = base.with_key_prefix("other"); + + prefixed + .insert_with_key(&TestKey::new("child"), Arc::new(vec![1])) + .await; + other + .insert_with_key(&TestKey::new("other"), Arc::new(vec![2])) + .await; + assert_eq!(base.keys().await.unwrap().count(), 2); + + prefixed.invalidate_prefix("").await; + let keys = base.keys().await.unwrap().collect::>(); + assert_eq!( + key_fields(&keys), + BTreeSet::from([( + "other/".to_string(), + "other".to_string(), + TestKey::>::type_name() + )]) + ); + + base.clear().await; + assert_eq!(base.keys().await.unwrap().count(), 0); + } + #[tokio::test] async fn test_cache_get_or_insert() { let cache = LanceCache::with_capacity(1000); @@ -833,6 +972,7 @@ mod tests { .await .is_none() ); + assert!(cache.keys().await.is_none()); } #[tokio::test] diff --git a/rust/lance-core/src/cache/moka.rs b/rust/lance-core/src/cache/moka.rs index 6be7760458a..a3956c1720c 100644 --- a/rust/lance-core/src/cache/moka.rs +++ b/rust/lance-core/src/cache/moka.rs @@ -11,7 +11,7 @@ use futures::Future; use crate::Result; use super::CacheCodec; -use super::backend::{CacheBackend, CacheEntry, InternalCacheKey}; +use super::backend::{CacheBackend, CacheEntry, CacheKeyIterator, InternalCacheKey}; /// Internal record stored in the moka cache. 
#[derive(Clone, Debug)] @@ -123,6 +123,13 @@ impl CacheBackend for MokaCacheBackend { self.cache.run_pending_tasks().await; } + async fn keys(&self) -> Option> { + self.cache.run_pending_tasks().await; + Some(Box::new( + self.cache.iter().map(|(key, _)| key.as_ref().clone()), + )) + } + async fn num_entries(&self) -> usize { self.cache.run_pending_tasks().await; self.cache.entry_count() as usize diff --git a/rust/lance/src/session.rs b/rust/lance/src/session.rs index 484d53c066a..8d5e9717570 100644 --- a/rust/lance/src/session.rs +++ b/rust/lance/src/session.rs @@ -4,7 +4,7 @@ use std::collections::HashMap; use std::sync::Arc; -use lance_core::cache::{CacheBackend, LanceCache}; +use lance_core::cache::{CacheBackend, CacheKeyIterator, LanceCache}; use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result}; use lance_index::IndexType; @@ -209,6 +209,44 @@ impl Session { pub async fn index_cache_stats(&self) -> lance_core::cache::CacheStats { self.index_cache.0.stats().await } + + /// Return an iterator over keys currently held by the index cache. + /// + /// Returns `None` when the index cache backend does not support key + /// inventory. + /// + /// # Examples + /// + /// ``` + /// # use lance::session::Session; + /// # async fn example() { + /// let session = Session::default(); + /// let keys = session.index_cache_keys().await; + /// assert!(keys.is_some()); + /// # } + /// ``` + pub async fn index_cache_keys(&self) -> Option> { + self.index_cache.0.keys().await + } + + /// Return an iterator over keys currently held by the metadata cache. + /// + /// Returns `None` when the metadata cache backend does not support key + /// inventory. 
+ /// + /// # Examples + /// + /// ``` + /// # use lance::session::Session; + /// # async fn example() { + /// let session = Session::default(); + /// let keys = session.metadata_cache_keys().await; + /// assert!(keys.is_some()); + /// # } + /// ``` + pub async fn metadata_cache_keys(&self) -> Option> { + self.metadata_cache.0.keys().await + } } impl Default for Session { @@ -224,10 +262,23 @@ impl Default for Session { #[cfg(test)] mod tests { use super::*; - use lance_core::cache::UnsizedCacheKey; + use lance_core::cache::{CacheKey, UnsizedCacheKey}; use lance_index::vector::VectorIndex; use std::borrow::Cow; + struct TestKey(&'static str); + impl CacheKey for TestKey { + type ValueType = Vec; + + fn key(&self) -> Cow<'_, str> { + Cow::Borrowed(self.0) + } + + fn type_name() -> &'static str { + "TestVec" + } + } + struct TestUnsizedKey(&'static str); impl UnsizedCacheKey for TestUnsizedKey { type ValueType = dyn VectorIndex; @@ -251,4 +302,41 @@ mod tests { .is_none() ); } + + #[tokio::test] + async fn test_session_cache_keys() { + let session = Session::new(10_000, 10_000, Default::default()); + + session + .index_cache + .insert_with_key(&TestKey("index-key"), Arc::new(vec![1])) + .await; + session + .metadata_cache + .0 + .insert_with_key(&TestKey("metadata-key"), Arc::new(vec![2])) + .await; + + let index_keys = session + .index_cache_keys() + .await + .unwrap() + .collect::>(); + assert_eq!(index_keys.len(), 1); + assert_eq!(index_keys[0].prefix(), ""); + assert_eq!(index_keys[0].key(), "index-key"); + assert_eq!(index_keys[0].type_name(), "TestVec"); + + let metadata_keys = session + .metadata_cache_keys() + .await + .unwrap() + .collect::>(); + assert_eq!(metadata_keys.len(), 1); + assert_eq!(metadata_keys[0].prefix(), ""); + assert_eq!(metadata_keys[0].key(), "metadata-key"); + assert_eq!(metadata_keys[0].type_name(), "TestVec"); + + assert_ne!(index_keys, metadata_keys); + } } From 6e734df607f2841fe3bba82f05a90f3174933bab Mon Sep 17 00:00:00 2001 From: 
Lance Release Bot Date: Wed, 17 Jun 2026 00:13:40 +0000 Subject: [PATCH 120/177] chore: release beta version 8.0.0-beta.16 --- .bumpversion.toml | 2 +- Cargo.lock | 48 +++++++++++++++++++-------------------- Cargo.toml | 44 +++++++++++++++++------------------ java/lance-jni/Cargo.lock | 40 ++++++++++++++++---------------- java/lance-jni/Cargo.toml | 2 +- java/pom.xml | 2 +- python/Cargo.lock | 40 ++++++++++++++++---------------- python/Cargo.toml | 2 +- 8 files changed, 90 insertions(+), 90 deletions(-) diff --git a/.bumpversion.toml b/.bumpversion.toml index 61131e9d8b8..fe30629b529 100644 --- a/.bumpversion.toml +++ b/.bumpversion.toml @@ -1,5 +1,5 @@ [tool.bumpversion] -current_version = "8.0.0-beta.15" +current_version = "8.0.0-beta.16" parse = "(?P\\d+)\\.(?P\\d+)\\.(?P\\d+)(-(?P(beta|rc))\\.(?P\\d+))?" serialize = [ "{major}.{minor}.{patch}-{prerelease}.{prerelease_num}", diff --git a/Cargo.lock b/Cargo.lock index ac919e25a1d..63819e6f678 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3076,7 +3076,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4388,7 +4388,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "all_asserts", "approx", @@ -4491,7 +4491,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow-array", "arrow-buffer", @@ -4539,7 +4539,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrayref", "paste", @@ -4548,7 +4548,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow-array", "arrow-buffer", @@ -4588,7 +4588,7 @@ dependencies 
= [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow", "arrow-array", @@ -4621,7 +4621,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow", "arrow-array", @@ -4640,7 +4640,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "proc-macro2", "quote", @@ -4649,7 +4649,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow-arith", "arrow-array", @@ -4694,7 +4694,7 @@ dependencies = [ [[package]] name = "lance-examples" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "all_asserts", "arrow", @@ -4720,7 +4720,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow-arith", "arrow-array", @@ -4759,7 +4759,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "datafusion", "geo-traits", @@ -4773,7 +4773,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "approx", "arc-swap", @@ -4850,7 +4850,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow", "arrow-arith", @@ -4898,7 +4898,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "approx", "arrow-array", @@ -4917,7 +4917,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow", "async-trait", @@ -4929,7 +4929,7 @@ dependencies = [ [[package]] name = "lance-namespace-datafusion" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ 
"arrow-array", "arrow-schema", @@ -4945,7 +4945,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow", "arrow-array", @@ -5009,7 +5009,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow-array", "arrow-buffer", @@ -5027,7 +5027,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow", "arrow-array", @@ -5073,7 +5073,7 @@ dependencies = [ [[package]] name = "lance-test-macros" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "proc-macro2", "quote", @@ -5082,7 +5082,7 @@ dependencies = [ [[package]] name = "lance-testing" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow-array", "arrow-schema", @@ -5095,7 +5095,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "icu_segmenter", "jieba-rs", @@ -5107,7 +5107,7 @@ dependencies = [ [[package]] name = "lance-tools" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "clap", "lance-core", diff --git a/Cargo.toml b/Cargo.toml index 5a888fdd1cb..6e79a26e69f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ resolver = "3" [workspace.package] -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" @@ -57,27 +57,27 @@ rust-version = "1.91.0" [workspace.dependencies] arc-swap = "1.7" libc = "0.2.176" -lance = { version = "=8.0.0-beta.15", path = "./rust/lance", default-features = false } -lance-arrow = { version = "=8.0.0-beta.15", path = "./rust/lance-arrow" } -lance-core = { version = "=8.0.0-beta.15", path = "./rust/lance-core" } -lance-datafusion = { version = "=8.0.0-beta.15", path = "./rust/lance-datafusion" } -lance-datagen = { version = "=8.0.0-beta.15", path 
= "./rust/lance-datagen" } -lance-derive = { version = "=8.0.0-beta.15", path = "./rust/lance-derive" } -lance-encoding = { version = "=8.0.0-beta.15", path = "./rust/lance-encoding" } -lance-file = { version = "=8.0.0-beta.15", path = "./rust/lance-file" } -lance-geo = { version = "=8.0.0-beta.15", path = "./rust/lance-geo" } -lance-index = { version = "=8.0.0-beta.15", path = "./rust/lance-index" } -lance-io = { version = "=8.0.0-beta.15", path = "./rust/lance-io", default-features = false } -lance-linalg = { version = "=8.0.0-beta.15", path = "./rust/lance-linalg" } -lance-namespace = { version = "=8.0.0-beta.15", path = "./rust/lance-namespace" } -lance-namespace-impls = { version = "=8.0.0-beta.15", path = "./rust/lance-namespace-impls" } +lance = { version = "=8.0.0-beta.16", path = "./rust/lance", default-features = false } +lance-arrow = { version = "=8.0.0-beta.16", path = "./rust/lance-arrow" } +lance-core = { version = "=8.0.0-beta.16", path = "./rust/lance-core" } +lance-datafusion = { version = "=8.0.0-beta.16", path = "./rust/lance-datafusion" } +lance-datagen = { version = "=8.0.0-beta.16", path = "./rust/lance-datagen" } +lance-derive = { version = "=8.0.0-beta.16", path = "./rust/lance-derive" } +lance-encoding = { version = "=8.0.0-beta.16", path = "./rust/lance-encoding" } +lance-file = { version = "=8.0.0-beta.16", path = "./rust/lance-file" } +lance-geo = { version = "=8.0.0-beta.16", path = "./rust/lance-geo" } +lance-index = { version = "=8.0.0-beta.16", path = "./rust/lance-index" } +lance-io = { version = "=8.0.0-beta.16", path = "./rust/lance-io", default-features = false } +lance-linalg = { version = "=8.0.0-beta.16", path = "./rust/lance-linalg" } +lance-namespace = { version = "=8.0.0-beta.16", path = "./rust/lance-namespace" } +lance-namespace-impls = { version = "=8.0.0-beta.16", path = "./rust/lance-namespace-impls" } lance-namespace-datafusion = { version = "=7.0.0-beta.9", path = "./rust/lance-namespace-datafusion" } 
lance-namespace-reqwest-client = "0.8.6" -lance-select = { version = "=8.0.0-beta.15", path = "./rust/lance-select" } -lance-tokenizer = { version = "=8.0.0-beta.15", path = "./rust/lance-tokenizer" } -lance-table = { version = "=8.0.0-beta.15", path = "./rust/lance-table" } -lance-test-macros = { version = "=8.0.0-beta.15", path = "./rust/lance-test-macros" } -lance-testing = { version = "=8.0.0-beta.15", path = "./rust/lance-testing" } +lance-select = { version = "=8.0.0-beta.16", path = "./rust/lance-select" } +lance-tokenizer = { version = "=8.0.0-beta.16", path = "./rust/lance-tokenizer" } +lance-table = { version = "=8.0.0-beta.16", path = "./rust/lance-table" } +lance-test-macros = { version = "=8.0.0-beta.16", path = "./rust/lance-test-macros" } +lance-testing = { version = "=8.0.0-beta.16", path = "./rust/lance-testing" } approx = "0.5.1" # Note that this one does not include pyarrow arrow = { version = "58.0.0", optional = false, features = ["prettyprint"] } @@ -104,7 +104,7 @@ half = { "version" = "2.1", default-features = false, features = [ "num-traits", "std", ] } -lance-bitpacking = { version = "=8.0.0-beta.15", path = "./rust/compression/bitpacking" } +lance-bitpacking = { version = "=8.0.0-beta.16", path = "./rust/compression/bitpacking" } bitpacking = "0.9" bitvec = "1" bytes = "1.11.1" @@ -143,7 +143,7 @@ datafusion-substrait = { version = "53.0.0", default-features = false } dirs = "6.0.0" either = "1.0" fst = { version = "0.4.7", features = ["levenshtein"] } -fsst = { version = "=8.0.0-beta.15", path = "./rust/compression/fsst" } +fsst = { version = "=8.0.0-beta.16", path = "./rust/compression/fsst" } futures = "0.3" geoarrow-array = "0.8" geoarrow-schema = "0.8" diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index 39121f92a2c..5dba72718b3 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -2479,7 +2479,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name 
= "fsst" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow-array", "rand 0.9.4", @@ -3673,7 +3673,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arc-swap", "arrow", @@ -3746,7 +3746,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow-array", "arrow-buffer", @@ -3788,7 +3788,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrayref", "paste", @@ -3797,7 +3797,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow-array", "arrow-buffer", @@ -3835,7 +3835,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow", "arrow-array", @@ -3867,7 +3867,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow", "arrow-array", @@ -3884,7 +3884,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "proc-macro2", "quote", @@ -3893,7 +3893,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow-arith", "arrow-array", @@ -3928,7 +3928,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow-arith", "arrow-array", @@ -3958,7 +3958,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "datafusion", "geo-traits", @@ -3972,7 +3972,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" 
dependencies = [ "arc-swap", "arrow", @@ -4040,7 +4040,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow", "arrow-arith", @@ -4081,7 +4081,7 @@ dependencies = [ [[package]] name = "lance-jni" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow", "arrow-array", @@ -4117,7 +4117,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow-array", "arrow-buffer", @@ -4132,7 +4132,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow", "async-trait", @@ -4144,7 +4144,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow", "arrow-ipc", @@ -4193,7 +4193,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow-array", "arrow-buffer", @@ -4208,7 +4208,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow", "arrow-array", @@ -4245,7 +4245,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "icu_segmenter", "rust-stemmers", diff --git a/java/lance-jni/Cargo.toml b/java/lance-jni/Cargo.toml index c759a59858b..12ae647ab58 100644 --- a/java/lance-jni/Cargo.toml +++ b/java/lance-jni/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lance-jni" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" edition = "2024" authors = ["Lance Devs "] rust-version = "1.91" diff --git a/java/pom.xml b/java/pom.xml index fc109711a07..c9e4dcf8a9a 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -7,7 +7,7 @@ org.lance lance-core Lance Core - 8.0.0-beta.15 + 8.0.0-beta.16 jar Lance Format Java API diff --git a/python/Cargo.lock 
b/python/Cargo.lock index 5275411197f..f4e52846476 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -2859,7 +2859,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4075,7 +4075,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arc-swap", "arrow", @@ -4149,7 +4149,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow-array", "arrow-buffer", @@ -4191,7 +4191,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrayref", "paste", @@ -4200,7 +4200,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow-array", "arrow-buffer", @@ -4238,7 +4238,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow", "arrow-array", @@ -4270,7 +4270,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow", "arrow-array", @@ -4287,7 +4287,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "proc-macro2", "quote", @@ -4296,7 +4296,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow-arith", "arrow-array", @@ -4331,7 +4331,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow-arith", "arrow-array", @@ -4361,7 +4361,7 @@ dependencies = [ [[package]] name = "lance-geo" 
-version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "datafusion", "geo-traits", @@ -4375,7 +4375,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arc-swap", "arrow", @@ -4444,7 +4444,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow", "arrow-arith", @@ -4485,7 +4485,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow-array", "arrow-buffer", @@ -4500,7 +4500,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow", "async-trait", @@ -4512,7 +4512,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow", "arrow-ipc", @@ -4561,7 +4561,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow-array", "arrow-buffer", @@ -4576,7 +4576,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "arrow", "arrow-array", @@ -4615,7 +4615,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "icu_segmenter", "jieba-rs", @@ -6058,7 +6058,7 @@ dependencies = [ [[package]] name = "pylance" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" dependencies = [ "alloc-stdlib", "arrow", diff --git a/python/Cargo.toml b/python/Cargo.toml index 3bfdc548fca..db4f26d80c7 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pylance" -version = "8.0.0-beta.15" +version = "8.0.0-beta.16" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" From 4eace839d5d41a38ea06a44a4f9d7778b678d3ca Mon Sep 17 00:00:00 
2001 From: George Stamatakis <126914070+gstamatakis95@users.noreply.github.com> Date: Wed, 17 Jun 2026 06:24:14 +0200 Subject: [PATCH 121/177] fix(rust): fixed IVF_RQ indexes ignoring the frag-reuse index (#7217) Closes #7216 After compaction with defer_index_remap=true, covering vector indexes are expected to remap their stored row addresses through the __lance_frag_reuse index at load time. The IVF_PQ storage does this, but RabitQuantizationStorage::try_from_batch received the frag-reuse index as a parameter named _fri and discarded it. As a result an IVF_RQ index loaded from a debt-carrying version kept its pre-compaction row addresses, and any ANN query that fetched row content failed at the scanner take stage with take operation specified fragment id N but this fragment does not exist. The IVF state cache reconstruction path in reconstruct_typed passed a hard-coded None for the frag-reuse index, so even with the storage fixed the remap was skipped whenever the index was rebuilt from a cached state. The cache key already includes the frag-reuse uuid, so cached states with and without remap debt stay distinct when the index is threaded through. This PR makes the following changes: In rust/lance-index/src/vector/bq/storage.rs, try_from_batch now builds a row id mapping for the partition from FragReuseIndex::remap_row_id, mirroring the PQ derivation, and applies the storage's existing remap method. Reusing remap instead of porting the PQ inline rewrite is deliberate, because RQ codes live in a packed and permuted SIMD layout and remap already performs the layout-correct filter and repack, including the multi-bit split ex code and factor columns. It is already pinned by the existing identity-remap regression tests. When no frag-reuse index is present, when its row id maps are empty, or when no row in the partition is affected, the path stays a no-op. 
The generic remap_row_ids_record_batch helper used by the flat and SQ storages was considered and does not fit here, both because it assumes a two-column batch and because a plain row-wise take is not safe on the packed code layout. In rust/lance/src/index/vector/ivf/v2.rs and rust/lance/src/index.rs, IvfStateEntry::reconstruct gains a frag-reuse parameter and the caller passes open_frag_reuse_index, which is already cached in the index cache, closing the warm-cache hole without adding load overhead. In rust/lance/src/dataset/optimize.rs, a regression test test_read_ivf_rq_index_v3_with_defer_index_remap is added next to the PQ analog. It queries with a non-empty projection because the take stage where this bug manifests is only exercised when row content is fetched. The existing PQ test uses an empty projection and never reaches that stage, which is why this went undetected. Verification: with the storage remap reverted, the new test fails with the exact defect signature from scanner.rs. With the fix, it passes. cargo test -p lance defer_index_remap passes (the new RQ test plus the existing PQ tests), cargo test -p lance-index bq passes, and cargo fmt and cargo clippy -- -D warnings are clean on the touched crates.
--- rust/lance-index/src/vector/bq/storage.rs | 43 +++++++- rust/lance/src/dataset/optimize.rs | 119 ++++++++++++++++++++++ rust/lance/src/index.rs | 8 +- rust/lance/src/index/vector/ivf/v2.rs | 7 +- 4 files changed, 172 insertions(+), 5 deletions(-) diff --git a/rust/lance-index/src/vector/bq/storage.rs b/rust/lance-index/src/vector/bq/storage.rs index 9c355d26960..2f4fe69792a 100644 --- a/rust/lance-index/src/vector/bq/storage.rs +++ b/rust/lance-index/src/vector/bq/storage.rs @@ -2382,6 +2382,38 @@ pub fn unpack_codes(codes: &FixedSizeListArray) -> FixedSizeListArray { FixedSizeListArray::try_new_from_values(UInt8Array::from(unpacked), code_len as i32).unwrap() } +/// Build a row-id remapping for the rows present in this partition from a +/// fragment-reuse index, mirroring the PQ storage frag-reuse path. +/// +/// Returns `None` when there is nothing to do (no fragment-reuse index, or the +/// index leaves every present row id unchanged), so callers keep the zero-cost +/// no-op path. Otherwise, returns a `HashMap` mapping every affected old row id +/// to `Some(new_id)` for surviving rows or `None` for rows whose covering +/// fragment was compacted away, suitable for `RabitQuantizationStorage::remap`. 
+fn build_frag_reuse_mapping( + fri: Option<&FragReuseIndex>, + row_ids: &UInt64Array, +) -> Option>> { + let fri = fri?; + if fri.row_id_maps.is_empty() { + return None; + } + let mut mapping: HashMap> = HashMap::new(); + for row_id in row_ids.values().iter() { + match fri.remap_row_id(*row_id) { + Some(new_id) if new_id == *row_id => {} + mapped => { + mapping.insert(*row_id, mapped); + } + } + } + if mapping.is_empty() { + None + } else { + Some(mapping) + } +} + #[async_trait] impl QuantizerStorage for RabitQuantizationStorage { type Metadata = RabitQuantizationMetadata; @@ -2390,7 +2422,7 @@ impl QuantizerStorage for RabitQuantizationStorage { batch: RecordBatch, metadata: &Self::Metadata, distance_type: DistanceType, - _fri: Option>, + fri: Option>, ) -> Result { let distance_type = match (metadata.query_estimator, distance_type) { (RabitQueryEstimator::RawQuery, DistanceType::Cosine) => DistanceType::L2, @@ -2486,7 +2518,7 @@ impl QuantizerStorage for RabitQuantizationStorage { let packed_ex_codes = maybe_pack_ex_codes(ex_codes.as_ref(), ex_bits, error_factors.as_ref()); - Ok(Self { + let storage = Self { metadata, batch, distance_type, @@ -2499,7 +2531,12 @@ impl QuantizerStorage for RabitQuantizationStorage { packed_ex_codes, ex_add_factors, ex_scale_factors, - }) + }; + + match build_frag_reuse_mapping(fri.as_deref(), &storage.row_ids) { + Some(mapping) => storage.remap(&mapping), + None => Ok(storage), + } } fn metadata(&self) -> &Self::Metadata { diff --git a/rust/lance/src/dataset/optimize.rs b/rust/lance/src/dataset/optimize.rs index 56cf74c1a62..4b5f3505f69 100644 --- a/rust/lance/src/dataset/optimize.rs +++ b/rust/lance/src/dataset/optimize.rs @@ -4690,6 +4690,125 @@ mod tests { ); } + #[tokio::test] + async fn test_read_ivf_rq_index_v3_with_defer_index_remap() { + use arrow_array::cast::AsArray; + use lance_index::vector::bq::RQBuildParams; + + let mut dataset = lance_datagen::gen_batch() + .col( + "vec", + 
lance_datagen::array::rand_vec::(Dimension::from(128)), + ) + .into_ram_dataset(FragmentCount::from(6), FragmentRowCount::from(1000)) + .await + .unwrap(); + + let stored: Vec> = { + let mut scanner = dataset.scan(); + scanner.project(&["vec"]).unwrap(); + let batches = scanner + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + let mut out = Vec::new(); + for batch in &batches { + let vecs = batch["vec"].as_fixed_size_list(); + for i in 0..batch.num_rows() { + let values = vecs.value(i); + let values = values.as_primitive::(); + out.push(values.values().to_vec()); + } + } + out + }; + + let index_name = Some("vec_idx".into()); + dataset + .create_index( + &["vec"], + IndexType::Vector, + index_name.clone(), + &VectorIndexParams { + metric_type: DistanceType::L2, + stages: vec![ + StageParams::Ivf(IvfBuildParams { + max_iters: 2, + num_partitions: Some(2), + sample_rate: 2, + ..Default::default() + }), + StageParams::RQ(RQBuildParams::new(1)), + ], + version: crate::index::vector::IndexFileVersion::V3, + skip_transpose: false, + runtime_hints: Default::default(), + }, + false, + ) + .await + .unwrap(); + let indices = dataset.load_indices().await.unwrap(); + let original_index = indices.iter().find(|idx| idx.name == "vec_idx").unwrap(); + + let options = CompactionOptions { + target_rows_per_fragment: 2_000, + defer_index_remap: true, + ..Default::default() + }; + let metrics = compact_files(&mut dataset, options, None).await.unwrap(); + assert!(metrics.fragments_removed > 0); + assert!(metrics.fragments_added > 0); + + let Some(current_index) = dataset.load_index_by_name("vec_idx").await.unwrap() else { + panic!("vec index must be available"); + }; + assert_eq!(current_index.uuid, original_index.uuid); + + let frag_reuse_present = dataset + .load_indices() + .await + .unwrap() + .iter() + .any(|idx| idx.name == FRAG_REUSE_INDEX_NAME); + assert!( + frag_reuse_present, + "defer_index_remap must record a {} index", + 
FRAG_REUSE_INDEX_NAME + ); + + let sample_step = (stored.len() / 8).max(1); + let mut checked = 0; + for query in stored.iter().step_by(sample_step) { + let query_vec = PrimitiveArray::::from_iter_values(query.iter().copied()); + let mut scanner = dataset.scan(); + scanner.nearest("vec", &query_vec, 5).unwrap(); + scanner.project(&["vec"]).unwrap().with_row_id(); + let batches = scanner + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + assert!(!batches.is_empty(), "query returned no batches"); + let top = &batches[0]; + assert!(top.num_rows() > 0, "query returned empty top batch"); + let top_vec = top["vec"].as_fixed_size_list().value(0); + let top_vec = top_vec.as_primitive::(); + assert_eq!( + top_vec.values(), + query.as_slice(), + "top-1 self-recall returned a different vector than the query" + ); + checked += 1; + } + assert!(checked > 0, "expected to check at least one stored vector"); + } + #[tokio::test] async fn test_default_compaction_planner() { let test_dir = TempStrDir::default(); diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 69acc69b6da..6a61e6fb0d6 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -1899,9 +1899,15 @@ impl DatasetIndexInternalExt for Dataset { if let Some(entry) = self.index_cache.get_with_key(&state_key).await { log::debug!("Found IvfIndexState in cache uuid: {}", uuid); let partition_cache = self.index_cache.with_key_prefix(&state_key.key()); + let frag_reuse_index = self.open_frag_reuse_index(metrics).await?; return entry .0 - .reconstruct(object_store, self.metadata_cache.as_ref(), partition_cache) + .reconstruct( + object_store, + self.metadata_cache.as_ref(), + partition_cache, + frag_reuse_index, + ) .await; } diff --git a/rust/lance/src/index/vector/ivf/v2.rs b/rust/lance/src/index/vector/ivf/v2.rs index 202a4423d49..2c91f6311ab 100644 --- a/rust/lance/src/index/vector/ivf/v2.rs +++ b/rust/lance/src/index/vector/ivf/v2.rs @@ -253,6 +253,7 @@ 
pub(crate) trait IvfStateEntry: DeepSizeOf + Send + Sync + 'static { object_store: Arc, file_metadata_cache: &'a LanceCache, index_cache: LanceCache, + frag_reuse_index: Option>, ) -> BoxFuture<'a, Result>>; } @@ -435,6 +436,7 @@ impl IvfStateEntry for IvfIndexState { object_store: Arc, file_metadata_cache: &'a LanceCache, index_cache: LanceCache, + frag_reuse_index: Option>, ) -> BoxFuture<'a, Result>> { Box::pin(async move { match self.sub_index_type { @@ -444,6 +446,7 @@ impl IvfStateEntry for IvfIndexState { object_store, file_metadata_cache, index_cache, + frag_reuse_index, ) .await } @@ -453,6 +456,7 @@ impl IvfStateEntry for IvfIndexState { object_store, file_metadata_cache, index_cache, + frag_reuse_index, ) .await } @@ -1857,6 +1861,7 @@ async fn reconstruct_typed( object_store: Arc, file_metadata_cache: &LanceCache, index_cache: LanceCache, + frag_reuse_index: Option>, ) -> Result> { let io_parallelism = object_store.io_parallelism(); @@ -1912,7 +1917,7 @@ async fn reconstruct_typed( state.aux_ivf.clone(), state.metadata.clone(), state.distance_type, - None, + frag_reuse_index, ); let rq_search_cache = IVFIndex::::rq_search_cache_from_state(state, &storage)?; From 747fa0bba44a5ebc47cc0251f2180a9ce928a485 Mon Sep 17 00:00:00 2001 From: XY Zhan Date: Wed, 17 Jun 2026 00:28:14 -0400 Subject: [PATCH 122/177] fix: remap index data when its fragment bitmap was already coverage-remapped (#7286) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary `remap_column_index` can silently skip remapping an index's data after a deferred compaction, which leaves the index permanently dependent on the fragment-reuse index and prevents that reuse index from ever being trimmed (it grows without bound). This happens whenever the index's `fragment_bitmap` has already been coverage-remapped onto the new fragments before the data remap runs — most reliably when a table carries **more than one index**. 
The fix makes `remap_index` decide whether to remap an index's *data* from whether that data predates the reuse version, instead of inferring it from the current state of the fragment bitmap (which can be cleaned independently of the data). ## Background With `defer_index_remap: true`, compaction rewrites fragments but does **not** rewrite index data inline. Instead it records a fragment-reuse index (FRI) holding, per compaction, the old→new row-address map. Two remaps then happen at different times: - **Coverage remap** — `load_indices` (`rust/lance/src/index.rs`) applies `FragReuseIndex::remap_fragment_bitmap` to every index's `fragment_bitmap` in memory, swapping the compacted-away fragment ids for the new ones, so queries route correctly. - **Data remap** — `remap_column_index` / `remap_index` (`rust/lance/src/dataset/optimize/remapping.rs`) actually rewrites the row addresses stored *inside* the index, and advances the index's `dataset_version`. Until this runs, the index data still holds the old addresses and relies on FRI auto-remap-at-load for correctness. `cleanup_frag_reuse_index` can only trim a reuse version once **every** index is "caught up" with it (`is_index_remap_caught_up`), which requires both that the index no longer references the version's old fragments **and** that `index.dataset_version >= reuse_version.dataset_version`. ## The bug After a deferred compaction, remapping the indexes one by one leaves some of them un-remapped, and the reuse index never trims. Reproduced deterministically with two scalar indexes on one table (see the added test). 
Observed state after compaction + remapping both indexes: | index | `dataset_version` | `fragment_bitmap` | |---|---|---| | FRI version | 4 | — | | first index remapped | 6 (advanced ✓) | `[10]` | | second index | 2 (**stale**) | `[10]` | | third index | 3 (**stale**) | `[10]` | The second/third indexes' bitmaps are already clean (`[10]`, no old fragments), but their data was never remapped and their `dataset_version` never advanced — so `is_index_remap_caught_up` returns false on the `dataset_version < reuse_version.dataset_version` check, and the reuse version is retained forever. ## Root cause The deferred-compaction commit itself does **not** coverage-remap the on-disk bitmaps (with stable row ids disabled, `Operation::Rewrite` takes the `handle_rewrite_indices` path with an empty `rewritten_indices`, leaving bitmaps untouched). The cleaned bitmap is introduced later, by the **remap path itself**: 1. `remap_index` begins with `dataset.load_indices()`, which returns every index's bitmap **coverage-remapped in memory** onto the new fragments. 2. When it commits the **first** index's remap, the new manifest is built from that coverage-remapped in-memory index list — so the cleaned bitmap is **persisted to disk for the other, not-yet-remapped indexes**, without their data being remapped or their `dataset_version` being advanced. 3. `remap_index` for those remaining indexes computes `should_remap` from `old_frag_in_index > 0`. The old fragments are already gone from the persisted bitmap, so `should_remap` is `false` → the data remap is skipped → the index keeps its stale addresses and stale `dataset_version` → the reuse index can never be trimmed. In short: the fragment bitmap is an unreliable signal for "has this index's data been remapped?", because coverage remap and data remap are decoupled and the cleaned bitmap can be persisted before the data remap happens. ### Why not "fix" the trim check instead? 
The `dataset_version` guard in `is_index_remap_caught_up` is correct and protective: an index whose bitmap is coverage-remapped but whose **data** still holds old addresses genuinely still needs the reuse index for auto-remap-at-load. Advancing `dataset_version` (or trimming) on the strength of a coverage remap alone would discard the reuse index while the data is still stale → **incorrect query results**. So the data remap must actually run; the fix belongs in `remap_index`. ## The fix `rust/lance/src/dataset/optimize/remapping.rs`, in `remap_index`: when a rewrite group's **old** fragments are already gone from the bitmap but its **new** fragments are present, and the index data still predates the reuse version (`index.dataset_version < version.dataset_version`), set `should_remap = true` anyway. The bitmap is already correct in that case, so only the data is remapped; `dataset_version` then advances and the reuse index can trim. The existing "old fragments present" path is left byte-for-byte unchanged, so the normal (non-pre-cleaned) case — including chained single-index bitmap remapping — behaves exactly as before. ```rust let data_predates_version = curr_index_meta.dataset_version < version.dataset_version; // ... existing per-group loop ... if old_frag_in_index > 0 { // unchanged: rewrite the bitmap and remap the data } else if data_predates_version && group.new_frags.iter().any(|f| index_frag_bitmap.contains(f.id as u32)) { // bitmap already coverage-remapped + persisted before the data remap: // remap the data anyway (bitmap is already correct) should_remap = true; } ``` ## Testing - **New regression test** `test_cleanup_frag_reuse_index_multiple_indices` (`rust/lance/src/dataset/index/frag_reuse.rs`): two scalar indexes on one table, deferred compaction, remap both, then assert each index is caught up and the reuse index trims to zero versions. **Fails on `main`** (`index j_idx was not caught up after remap`), passes with this change. 
- Existing suites green with the fix: `frag_reuse` (incl. the single-index `test_cleanup_frag_reuse_index`), `remap` (incl. `test_remap_index_after_compaction`, which exercises chained single-index bitmap remapping), and `compaction`. - `cargo fmt` + `cargo clippy` clean. ## Scope / risk - ~20 lines, one new `else if` branch; no public API change. - The change only ever *enables* a data remap that should have happened; it cannot skip one the old code performed (the new branch is additive to the existing `should_remap` conditions). --- rust/lance/src/dataset/index/frag_reuse.rs | 87 ++++++++++++++++++++ rust/lance/src/dataset/optimize/remapping.rs | 20 +++++ 2 files changed, 107 insertions(+) diff --git a/rust/lance/src/dataset/index/frag_reuse.rs b/rust/lance/src/dataset/index/frag_reuse.rs index 4fbefcd4725..ed6f027e159 100644 --- a/rust/lance/src/dataset/index/frag_reuse.rs +++ b/rust/lance/src/dataset/index/frag_reuse.rs @@ -243,4 +243,91 @@ mod tests { Err(Error::RetryableCommitConflict { .. }) )); } + + /// With more than one index on the table, remapping every index must catch + /// all of them up so the reuse index can be trimmed. + /// + /// Regression: `remap_column_index` used to decide whether to remap an + /// index's data from the presence of the old fragments in its fragment + /// bitmap. But `load_indices` coverage-remaps the bitmap onto the new + /// fragments in memory, and remapping the *first* index commits a manifest + /// that persists that cleaned bitmap for the others — so remapping the + /// remaining indexes became a silent no-op (their data was never remapped + /// and their `dataset_version` never advanced), and the reuse index could + /// never be trimmed. 
+ #[tokio::test] + async fn test_cleanup_frag_reuse_index_multiple_indices() { + let mut dataset = lance_datagen::gen_batch() + .col("i", lance_datagen::array::step::()) + .col("j", lance_datagen::array::step::()) + .into_ram_dataset(FragmentCount::from(6), FragmentRowCount::from(1000)) + .await + .unwrap(); + + for col in ["i", "j"] { + dataset + .create_index( + &[col], + IndexType::Scalar, + Some(format!("{col}_idx")), + &ScalarIndexParams::default(), + false, + ) + .await + .unwrap(); + } + + compact_files( + &mut dataset, + CompactionOptions { + target_rows_per_fragment: 2_000, + defer_index_remap: true, + ..Default::default() + }, + None, + ) + .await + .unwrap(); + + let frag_reuse_index_meta = dataset + .load_index_by_name(FRAG_REUSE_INDEX_NAME) + .await + .unwrap() + .expect("Fragment reuse index must be available"); + let frag_reuse_details = load_frag_reuse_index_details(&dataset, &frag_reuse_index_meta) + .await + .unwrap(); + assert_eq!(frag_reuse_details.versions.len(), 1); + + for col in ["i", "j"] { + remapping::remap_column_index(&mut dataset, &[col], Some(format!("{col}_idx"))) + .await + .unwrap(); + } + + // Every index must now be caught up (data remapped, version advanced). + let indices = dataset.load_indices().await.unwrap(); + for col in ["i", "j"] { + let index = indices + .iter() + .find(|idx| idx.name == format!("{col}_idx")) + .unwrap(); + assert!( + is_index_remap_caught_up(&frag_reuse_details.versions[0], index).unwrap(), + "index {col}_idx was not caught up after remap" + ); + } + + // ... so the reuse index trims down to zero versions. 
+ cleanup_frag_reuse_index(&mut dataset).await.unwrap(); + let frag_reuse_index_meta = dataset + .load_index_by_name(FRAG_REUSE_INDEX_NAME) + .await + .unwrap() + .expect("Fragment reuse index must be available"); + let frag_reuse_details = load_frag_reuse_index_details(&dataset, &frag_reuse_index_meta) + .await + .unwrap(); + assert_eq!(frag_reuse_details.versions.len(), 0); + } } diff --git a/rust/lance/src/dataset/optimize/remapping.rs b/rust/lance/src/dataset/optimize/remapping.rs index dab62bf6166..ca1ed54f30f 100644 --- a/rust/lance/src/dataset/optimize/remapping.rs +++ b/rust/lance/src/dataset/optimize/remapping.rs @@ -235,6 +235,13 @@ async fn remap_index(dataset: &mut Dataset, index_id: &Uuid) -> Result<()> { .find(|idx| idx.uuid == curr_index_id) .unwrap(); + // Whether the index data predates this reuse version, i.e. its stored + // row addresses still point at the compacted-away fragments. The + // fragment bitmap alone cannot tell us this: `load_indices` + // coverage-remaps the bitmap onto the new fragments in memory, and a + // later commit can persist that cleaned bitmap to disk without the index + // data ever being remapped (e.g. while remapping a *sibling* index). 
+ let data_predates_version = curr_index_meta.dataset_version < version.dataset_version; let maybe_index_bitmap = curr_index_meta.fragment_bitmap.clone(); let (should_remap, bitmap_after_remap) = match maybe_index_bitmap { Some(mut index_frag_bitmap) => { @@ -261,6 +268,19 @@ async fn remap_index(dataset: &mut Dataset, index_id: &Uuid) -> Result<()> { index_frag_bitmap .extend(group.new_frags.clone().into_iter().map(|f| f.id as u32)); should_remap = true; + } else if data_predates_version + && group + .new_frags + .iter() + .any(|new_frag| index_frag_bitmap.contains(new_frag.id as u32)) + { + // The bitmap was already coverage-remapped onto this + // group's new fragments and persisted before the data was + // remapped, so the old fragments are gone from the bitmap + // but the index data still needs remapping. Without this + // the data remap is silently skipped and the reuse index + // can never be trimmed. + should_remap = true; } } (should_remap, Some(index_frag_bitmap)) From d076a7a48fdfa13ce0bf4e9fd903457a3c94677f Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 17 Jun 2026 10:13:33 -0700 Subject: [PATCH 123/177] feat: stabilize cache codec with a versioned envelope (#7163) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements #7160. Cache entries (FTS posting lists, scalar/vector index state) were serialized with an ad-hoc, unversioned format only safe to read in the same process that wrote it. This stabilizes the format so entries can live in a **node-agnostic, restart-surviving** cache backend. 
## Wire format Each entry is an envelope followed by a body: ```text [magic "LCE1"][envelope_version: u8][type_id][type_version: u32] # envelope ``` Body sections, each self-delimiting: ```text HEADER : [len: u32][protobuf bytes] ARROW_IPC : [pad to 64B][self-delimiting IPC stream] RAW_BLOB : [len: u64][bytes] ``` ## Why this shape - **The envelope is hand-framed, not protobuf.** It's the most stability-critical part: it must parse robustly against *any* bytes (including old, pre-stabilization blobs) and never change shape. The magic is chosen so no prior blob can collide with it. - **Decode returns `Hit`/`Miss`, never a hard error.** Wrong/absent magic, an unknown envelope version, a `type_id` mismatch, a future `type_version`, or a body decode failure all become `Miss` → recompute. Old, foreign, or corrupt bytes self-heal with **zero migration code**. - **Bodies use protobuf headers.** Field-number evolution lets us add fields without a format break; only changes protobuf can't express transparently (reordering sections, changing a raw-blob encoding) bump `type_version`, which the reader branches on. - **Arrow IPC sections are 64-byte aligned** so concatenated sections decode zero-copy instead of a realigning `memcpy` on every read — this guards the FTS WAND hot path. - **`RAW_BLOB` is reserved for payloads with their own portable, self-describing encoding** (roaring bitmaps, the shared position stream). A codec with no scalar metadata (e.g. bitmap) simply omits the header — sections are positional, so nothing is written for an absent header. ## Scope All cache codecs migrated: FTS posting lists (compressed/plain/positions + groups), scalar indices (BTree/Bitmap/Flat/LabelList/RowAddrTreeMap), and the five IVF quantizer partitions + IVF state. The cache protos live in `lance-index/protos-cache/cache.proto` (`package lance.index.cache`) — they describe *library serialization*, not the on-disk format spec. 
## Tests Envelope round-trip and every miss path; per-codec round-trip + through-envelope zero-copy alignment (incl. RabitQ Matrix rotation, multi-batch SQ, nested bitmap in a label-list entry); additive proto-field compat; existing IVF build+search suites pass through the migrated path. Closes #7160. 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.8 (1M context) --- rust/lance-arrow/src/ipc.rs | 216 +++++- rust/lance-core/src/cache/codec.rs | 517 +++++++++++-- rust/lance-core/src/cache/entry_io.rs | 202 +++++ rust/lance-core/src/cache/mod.rs | 6 +- rust/lance-index/build.rs | 11 +- rust/lance-index/protos-cache/cache.proto | 194 +++++ rust/lance-index/src/lib.rs | 7 + rust/lance-index/src/scalar/bitmap.rs | 112 ++- rust/lance-index/src/scalar/btree.rs | 218 +++--- rust/lance-index/src/scalar/btree/flat.rs | 76 +- .../src/scalar/inverted/cache_codec.rs | 715 +++++++++++------- rust/lance-index/src/scalar/label_list.rs | 118 ++- rust/lance-select/src/mask.rs | 15 +- rust/lance-table/src/format/index.rs | 27 +- rust/lance/src/dataset/tests/dataset_index.rs | 6 +- .../src/index/vector/ivf/partition_serde.rs | 628 ++++++++------- rust/lance/src/index/vector/ivf/v2.rs | 99 +-- 17 files changed, 2329 insertions(+), 838 deletions(-) create mode 100644 rust/lance-core/src/cache/entry_io.rs create mode 100644 rust/lance-index/protos-cache/cache.proto diff --git a/rust/lance-arrow/src/ipc.rs b/rust/lance-arrow/src/ipc.rs index 1c6364c4525..8b6e5cf41fe 100644 --- a/rust/lance-arrow/src/ipc.rs +++ b/rust/lance-arrow/src/ipc.rs @@ -270,7 +270,7 @@ pub fn read_ipc_stream_single_at( /// Modern IPC streams have an 8-byte prefix `[continuation: 4][size: 4]`. /// Legacy streams have a 4-byte prefix `[size: 4]`. Returns `(prefix_len, meta_size)`. 
fn parse_ipc_message_prefix(buf: &Buffer) -> Result<(usize, usize), ArrowError> { - let has_continuation = buf.len() >= 4 && buf[..4] == [0xff; 4]; + let has_continuation = buf.len() >= 4 && buf[..4] == IPC_CONTINUATION; if has_continuation { if buf.len() < 8 { return Err(ArrowError::ParseError( @@ -358,6 +358,134 @@ pub fn read_ipc_stream_single(data: &Bytes) -> Result { } } +// --------------------------------------------------------------------------- +// Aligned IPC sections +// --------------------------------------------------------------------------- + +/// Byte alignment that each IPC section's stream start is padded to. +/// +/// When several IPC streams are concatenated into one larger blob (e.g. a +/// cache entry), a section that starts at an arbitrary offset would leave its +/// array data misaligned. [`FileDecoder`] with `require_alignment = false` +/// then silently copies each buffer into a freshly aligned allocation on +/// every read, defeating zero-copy. Padding each section start to a 64-byte +/// boundary keeps the decoded buffers borrowed directly from the input. +pub const IPC_SECTION_ALIGNMENT: usize = 64; + +/// Number of zero-padding bytes needed to advance `pos` to the next +/// [`IPC_SECTION_ALIGNMENT`] boundary. +fn section_padding(pos: usize) -> usize { + (IPC_SECTION_ALIGNMENT - (pos % IPC_SECTION_ALIGNMENT)) % IPC_SECTION_ALIGNMENT +} + +/// A [`Write`] adapter that counts the bytes written through it. +struct CountingWriter<'a> { + inner: &'a mut dyn Write, + count: usize, +} + +impl Write for CountingWriter<'_> { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + let n = self.inner.write(buf)?; + self.count += n; + Ok(n) + } + + fn flush(&mut self) -> std::io::Result<()> { + self.inner.flush() + } +} + +/// Write zero padding so the next byte lands on an [`IPC_SECTION_ALIGNMENT`] +/// boundary, advancing `pos` past it. 
+fn write_section_padding(writer: &mut dyn Write, pos: &mut usize) -> Result<(), ArrowError> { + let pad = section_padding(*pos); + if pad > 0 { + const ZEROS: [u8; IPC_SECTION_ALIGNMENT] = [0u8; IPC_SECTION_ALIGNMENT]; + writer + .write_all(&ZEROS[..pad]) + .map_err(|e| ArrowError::IoError(e.to_string(), e))?; + *pos += pad; + } + Ok(()) +} + +/// Write `batch` as a 64-byte-aligned single-batch Arrow IPC section. +/// +/// `pos` is the absolute byte offset of `writer` within the enclosing blob. +/// Zero padding is written first so the IPC stream begins on an +/// [`IPC_SECTION_ALIGNMENT`] boundary, then the stream itself. `pos` is +/// advanced past both the padding and the stream so the caller can write +/// further aligned sections. +/// +/// Paired with [`read_ipc_section_at`]. For the decoded buffers to be borrowed +/// zero-copy, the blob must ultimately be read back from a buffer whose base +/// address is at least 64-byte aligned. +pub fn write_ipc_section( + writer: &mut dyn Write, + pos: &mut usize, + batch: &RecordBatch, +) -> Result<(), ArrowError> { + write_section_padding(writer, pos)?; + + let mut counting = CountingWriter { + inner: writer, + count: 0, + }; + write_ipc_stream(batch, &mut counting)?; + *pos += counting.count; + Ok(()) +} + +/// Read a single [`RecordBatch`] from an aligned IPC section at `offset`. +/// +/// Skips the alignment padding written by [`write_ipc_section`], then reads +/// the stream, advancing `offset` past the section (padding + stream + EOS). +/// +/// Zero-copy: array buffers borrow from `data`'s allocation when `data`'s base +/// address is at least 64-byte aligned (see [`write_ipc_section`]). +pub fn read_ipc_section_at(data: &Bytes, offset: &mut usize) -> Result<RecordBatch, ArrowError> { + *offset += section_padding(*offset); + read_ipc_stream_single_at(data, offset) +} + +/// Write `batches` as a single 64-byte-aligned multi-batch Arrow IPC section.
+/// +/// Like [`write_ipc_section`] but emits every batch from `iter` into one IPC +/// stream (schema + N batches + EOS). `iter` must yield at least one batch. +/// Paired with [`read_ipc_section_batches_at`]. +pub fn write_ipc_section_batches<I>( + writer: &mut dyn Write, + pos: &mut usize, + iter: I, +) -> Result<(), ArrowError> +where + I: IntoIterator<Item = RecordBatch>, +{ + write_section_padding(writer, pos)?; + + let mut counting = CountingWriter { + inner: writer, + count: 0, + }; + write_ipc_stream_batches(iter, &mut counting)?; + *pos += counting.count; + Ok(()) +} + +/// Read all [`RecordBatch`]es from an aligned multi-batch IPC section at +/// `offset`, advancing `offset` past the section (padding + stream + EOS). +/// +/// Zero-copy: array buffers borrow from `data`'s allocation when `data`'s base +/// address is at least 64-byte aligned (see [`write_ipc_section_batches`]). +pub fn read_ipc_section_batches_at( + data: &Bytes, + offset: &mut usize, +) -> Result<Vec<RecordBatch>, ArrowError> { + *offset += section_padding(*offset); + read_ipc_stream_at(data, offset) +} + #[cfg(test)] mod tests { use arrow_array::{ArrayRef, record_batch}; @@ -403,4 +531,90 @@ mod tests { assert_col_zero_copy(batch.column(1)); } } + + /// Allocate a [`Bytes`] whose base address is 64-byte aligned, modelling a + /// backend that reads cache entries into an aligned buffer. A plain + /// `Bytes::from(vec)` only guarantees the allocator's alignment for `u8`. + fn aligned_bytes(payload: &[u8]) -> Bytes { + let mut v = vec![0u8; payload.len() + IPC_SECTION_ALIGNMENT]; + let pad = section_padding(v.as_ptr() as usize); + v[pad..pad + payload.len()].copy_from_slice(payload); + Bytes::from(v).slice(pad..pad + payload.len()) + } + + #[test] + fn test_aligned_ipc_sections_are_zero_copy() { + // A LargeBinary column exercises the i64-offset buffer whose 8-byte + // alignment requirement triggers a realigning memcpy when misaligned.
+ let blocks = arrow_array::LargeBinaryArray::from_vec(vec![&b"hello"[..], b"world"]); + let section_a = RecordBatch::try_from_iter([("a", Arc::new(blocks) as ArrayRef)]).unwrap(); + let section_b = record_batch!(("b", Int64, [10i64, 20, 30, 40, 50])).unwrap(); + + let mut buf = Vec::new(); + // Arbitrary, deliberately non-64-aligned preamble so the first section + // must be padded rather than landing at offset 0 by luck. + buf.extend_from_slice(&[0xABu8; 7]); + let mut pos = buf.len(); + // The first section's stream begins after padding the 7-byte preamble + // up to the next 64-byte boundary. + assert_eq!(7 + section_padding(7), IPC_SECTION_ALIGNMENT); + write_ipc_section(&mut buf, &mut pos, &section_a).unwrap(); + write_ipc_section(&mut buf, &mut pos, &section_b).unwrap(); + + let data = aligned_bytes(&buf); + assert_eq!( + section_padding(data.as_ptr() as usize), + 0, + "base not aligned" + ); + + let mut offset = 7; + let read_a = read_ipc_section_at(&data, &mut offset).unwrap(); + let read_b = read_ipc_section_at(&data, &mut offset).unwrap(); + assert_eq!(read_a, section_a); + assert_eq!(read_b, section_b); + + let data_base = data.as_ptr() as usize; + let data_end = data_base + data.len(); + for batch in [&read_a, &read_b] { + for buffer in batch.column(0).to_data().buffers() { + let ptr = buffer.as_ptr() as usize; + assert!( + ptr >= data_base && ptr < data_end, + "section buffer at {ptr:#x} was realigned out of the input \ + [{data_base:#x}..{data_end:#x}) — misaligned section", + ); + } + } + } + + #[test] + fn test_aligned_multi_batch_section_roundtrip_zero_copy() { + // A multi-batch section (e.g. IVF SQ storage chunks) must round-trip + // every batch and decode the first batch's buffers zero-copy.
+ let b1 = record_batch!(("v", Int64, [1i64, 2, 3])).unwrap(); + let b2 = record_batch!(("v", Int64, [4i64, 5])).unwrap(); + let b3 = record_batch!(("v", Int64, [6i64])).unwrap(); + + let mut buf = vec![0xCDu8; 5]; + let mut pos = buf.len(); + write_ipc_section_batches(&mut buf, &mut pos, [b1.clone(), b2.clone(), b3.clone()]) + .unwrap(); + + let data = aligned_bytes(&buf); + let mut offset = 5; + let read = read_ipc_section_batches_at(&data, &mut offset).unwrap(); + assert_eq!(read, vec![b1, b2, b3]); + assert_eq!(offset, buf.len(), "offset should land at section end"); + + let data_base = data.as_ptr() as usize; + let data_end = data_base + data.len(); + for buffer in read[0].column(0).to_data().buffers() { + let ptr = buffer.as_ptr() as usize; + assert!( + ptr >= data_base && ptr < data_end, + "first batch buffer at {ptr:#x} was realigned out of the input", + ); + } + } } diff --git a/rust/lance-core/src/cache/codec.rs b/rust/lance-core/src/cache/codec.rs index 34e5264bb28..bba54840829 100644 --- a/rust/lance-core/src/cache/codec.rs +++ b/rust/lance-core/src/cache/codec.rs @@ -5,12 +5,184 @@ //! //! Implement [`CacheCodecImpl`] on concrete types, then use //! [`CacheCodec::from_impl`] to produce a type-erased codec for the cache. +//! +//! # Wire format +//! +//! Every serialized entry begins with a small hand-framed **envelope** so the +//! reader can validate it before trusting the body: +//! +//! ```text +//! [magic: 4B = b"LCE1"] +//! [envelope_version: u8] +//! [type_id_len: u16 LE][type_id: utf8] # stable, author-assigned +//! [type_version: u32 LE] # per-type body schema version +//! +//! ``` +//! +//! The envelope is deliberately *not* protobuf: it is the most +//! stability-critical part, must parse robustly against arbitrary bytes +//! (including data written by older, pre-stabilization builds), and never +//! changes shape. Bodies use protobuf headers, where field-number evolution +//! pays off. +//! +//! # Decode outcome +//! +//! 
Deserialization never propagates a parse failure as a hard error into the +//! cache path. Anything the reader cannot confidently interpret — absent or +//! wrong magic, an unknown `envelope_version`, a `type_id` mismatch, an +//! unsupported `type_version`, or a body decode error — becomes +//! [`CacheDecode::Miss`]. A backend turns `Miss` into a normal cache miss and +//! recomputes the value. This is what lets data written by an older format +//! self-heal: it simply fails the magic check and is regenerated. +use std::io::Write; use std::sync::Arc; use bytes::Bytes; -use crate::Result; +use crate::{Error, Result}; + +use super::{CacheEntryReader, CacheEntryWriter}; + +// --------------------------------------------------------------------------- +// Envelope +// --------------------------------------------------------------------------- + +/// Magic bytes that prefix every stabilized cache entry. +/// +/// An ASCII tag (`0x4C 0x43 0x45 0x31`) chosen so it cannot collide with any +/// pre-stabilization blob: those began with either a small little-endian +/// length (tens of bytes) or a small tag byte, never these values. +/// +/// Exported so backends can cheaply identify Lance cache entries (e.g. when +/// scanning a persistent store at startup) without hardcoding the bytes — +/// prefer [`has_cache_envelope`] over comparing against this directly. +pub const MAGIC: [u8; 4] = *b"LCE1"; + +/// Returns `true` if `data` begins with the cache-entry [`MAGIC`]. +/// +/// A cheap prefix check for backends that need to recognize Lance cache +/// entries without fully [`deserialize`](CacheCodec::deserialize)-ing them. A +/// `true` result only means the framing looks like ours; the entry can still +/// decode to a [`Miss`](CacheDecode::Miss) (e.g. wrong `type_id`). +pub fn has_cache_envelope(data: &[u8]) -> bool { + data.get(..MAGIC.len()) == Some(&MAGIC[..]) +} + +/// Version of the envelope framing itself. 
Bumped only if the outer frame +/// (magic/version/type_id/type_version layout) ever changes — expected never. +const ENVELOPE_VERSION: u8 = 1; + +/// Parsed envelope borrowed from the input bytes. +struct ParsedEnvelope<'a> { + type_id: &'a str, + type_version: u32, + /// Offset of the first body byte within the input. + body_offset: usize, +} + +/// Parse and validate the envelope at the start of `data`. +/// +/// Returns `None` for anything that is not a well-formed envelope this build +/// understands (wrong/absent magic, unknown `envelope_version`, truncation, +/// non-utf8 `type_id`). Callers translate `None` into [`CacheDecode::Miss`]. +fn parse_envelope(data: &Bytes) -> Option<ParsedEnvelope<'_>> { + let bytes = data.as_ref(); + let mut off = 0usize; + + let magic = bytes.get(off..off + 4)?; + if magic != MAGIC { + return None; + } + off += 4; + + if *bytes.get(off)? != ENVELOPE_VERSION { + return None; + } + off += 1; + + let type_id_len = u16::from_le_bytes(bytes.get(off..off + 2)?.try_into().ok()?) as usize; + off += 2; + + let type_id = std::str::from_utf8(bytes.get(off..off + type_id_len)?).ok()?; + off += type_id_len; + + let type_version = u32::from_le_bytes(bytes.get(off..off + 4)?.try_into().ok()?); + off += 4; + + Some(ParsedEnvelope { + type_id, + type_version, + body_offset: off, + }) +} + +/// Write the envelope for `type_id`/`type_version`, returning the number of +/// bytes written (the body's starting offset).
+fn write_envelope(writer: &mut dyn Write, type_id: &str, type_version: u32) -> Result<usize> { + let type_id_len = u16::try_from(type_id.len()).map_err(|_| { + Error::io(format!( + "cache codec type_id too long ({} bytes, max {})", + type_id.len(), + u16::MAX + )) + })?; + + writer.write_all(&MAGIC)?; + writer.write_all(&[ENVELOPE_VERSION])?; + writer.write_all(&type_id_len.to_le_bytes())?; + writer.write_all(type_id.as_bytes())?; + writer.write_all(&type_version.to_le_bytes())?; + + Ok(4 + 1 + 2 + type_id.len() + 4) +} + +// --------------------------------------------------------------------------- +// CacheDecode — first-class cache-miss outcome +// --------------------------------------------------------------------------- + +/// Why a cache entry could not be decoded into the expected type. +/// +/// Carried by [`CacheDecode::Miss`] so backends can emit targeted metrics +/// (e.g. distinguish "evicting due to a stale format" from "type collision") +/// without re-parsing. Every reason maps to the same behavior — recompute via +/// the loader — so callers that don't care can ignore it. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum CacheMissReason { + /// Absent or wrong magic, unknown `envelope_version`, truncated framing, or + /// a non-utf8 `type_id`. Typically an entry written by a pre-stabilization + /// or otherwise foreign build. + InvalidEnvelope, + /// Well-formed envelope, but its `type_id` names a different entry type than + /// the codec reading it. + TypeMismatch, + /// Written by a newer build whose `type_version` this build does not + /// understand and must not attempt to interpret. + VersionTooNew, + /// Envelope validated, but the body failed to decode (truncation, a + /// malformed protobuf header, an IPC error, etc.). + BodyError, +} + +/// Outcome of deserializing a cache entry. +/// +/// `Miss` means the bytes could not be confidently decoded into `T`; the +/// [`CacheMissReason`] says why.
A backend treats any `Miss` exactly like a key +/// that was never present: recompute via the loader. +#[derive(Debug)] +pub enum CacheDecode { + Hit(T), + Miss(CacheMissReason), +} + +impl CacheDecode { + pub fn hit(self) -> Option { + match self { + Self::Hit(v) => Some(v), + Self::Miss(_) => None, + } + } +} // --------------------------------------------------------------------------- // CacheCodecImpl — trait for serializable cache entry types @@ -18,31 +190,40 @@ use crate::Result; /// Serialization trait for cache entries. /// -/// **Experimental**: the serialized format is not stable and may change -/// between releases without notice. +/// **Experimental**: the serialized format is not yet covered by a stability +/// guarantee and may change between releases. When it does stabilize, the +/// rules are: `TYPE_ID`, protobuf field numbers, and enum values are +/// append-only forever; format changes that protobuf cannot express +/// transparently bump [`CURRENT_VERSION`](Self::CURRENT_VERSION). /// -/// Implement this on concrete types that need to survive serialization -/// through a persistent cache backend. Then wire it into a [`CacheKey`](super::CacheKey) -/// via [`CacheCodec::from_impl`]: +/// Implement this on concrete types that need to survive serialization through +/// a persistent cache backend, then wire it into a +/// [`CacheKey`](super::CacheKey) via [`CacheCodec::from_impl`]. /// -/// ```ignore -/// impl CacheCodecImpl for MyData { -/// fn serialize(&self, w: &mut dyn Write) -> Result<()> { /* ... */ } -/// fn deserialize(data: &Bytes) -> Result { /* ... */ } -/// } -/// -/// impl CacheKey for MyDataKey { -/// type ValueType = MyData; -/// fn codec() -> Option { -/// Some(CacheCodec::from_impl::()) -/// } -/// // ... -/// } -/// ``` +/// The envelope (magic/version/type_id/type_version) is written and validated +/// by the [`CacheCodec`] wrapper. 
[`serialize`](Self::serialize) writes only +/// the body — a header followed by sections in a fixed, version-keyed order — +/// and [`deserialize`](Self::deserialize) reads them back in that same order. +/// The read sequence mirroring the write sequence for each `type_version` is +/// the invariant the implementor owns. pub trait CacheCodecImpl: Send + Sync { - fn serialize(&self, writer: &mut dyn std::io::Write) -> Result<()>; + /// Stable identity for this entry type. **Must not change once shipped.** + /// This is a deliberate author-assigned string, not `std::any::type_name` + /// (which is not stable across compiler versions). + const TYPE_ID: &'static str; + + /// Body schema version this build writes. Bump when the body layout + /// changes in a way protobuf field additions cannot express transparently + /// (adding/removing/reordering sections, a raw-blob encoding change, etc.). + const CURRENT_VERSION: u32; + + /// Write the body: a header, then sections in a fixed order. + fn serialize(&self, writer: &mut CacheEntryWriter<'_>) -> Result<()>; - fn deserialize(data: &Bytes) -> Result + /// Reconstruct from the body. Branch on + /// [`reader.version()`](CacheEntryReader::version) for backward compat; + /// sections are read in write order. + fn deserialize(reader: &mut CacheEntryReader<'_>) -> Result where Self: Sized; } @@ -55,25 +236,31 @@ pub(crate) type ArcAny = Arc; /// Type-erased codec for serializing and deserializing cache entries. /// -/// `CacheCodec` is two plain function pointers — it is `Copy` and has no -/// heap allocation. Construct one via [`CacheCodec::from_impl`] for types -/// that implement [`CacheCodecImpl`], or [`CacheCodec::new`] for custom -/// cases (e.g. when the orphan rule prevents a direct impl). +/// `CacheCodec` carries the entry's stable `type_id`/`version` plus two plain +/// function pointers — it is `Copy` and has no heap allocation. 
Construct one +/// via [`CacheCodec::from_impl`] for types that implement [`CacheCodecImpl`], +/// or [`CacheCodec::new`] for custom cases (e.g. when the orphan rule prevents +/// a direct impl). #[derive(Copy, Clone)] pub struct CacheCodec { - pub(crate) serialize: fn(&ArcAny, &mut dyn std::io::Write) -> Result<()>, - pub(crate) deserialize: fn(&Bytes) -> Result, + type_id: &'static str, + version: u32, + serialize_body: fn(&ArcAny, &mut CacheEntryWriter<'_>) -> Result<()>, + deserialize_body: fn(&mut CacheEntryReader<'_>) -> Result, } impl std::fmt::Debug for CacheCodec { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("CacheCodec").finish_non_exhaustive() + f.debug_struct("CacheCodec") + .field("type_id", &self.type_id) + .field("version", &self.version) + .finish_non_exhaustive() } } fn serialize_via_impl( any: &ArcAny, - writer: &mut dyn std::io::Write, + writer: &mut CacheEntryWriter<'_>, ) -> Result<()> { let val = any .downcast_ref::() @@ -81,44 +268,278 @@ fn serialize_via_impl( val.serialize(writer) } -fn deserialize_via_impl(data: &Bytes) -> Result { - let val = T::deserialize(data)?; +fn deserialize_via_impl( + reader: &mut CacheEntryReader<'_>, +) -> Result { + let val = T::deserialize(reader)?; Ok(Arc::new(val) as ArcAny) } impl CacheCodec { - /// Create a `CacheCodec` from plain function pointers. + /// Create a `CacheCodec` from explicit body function pointers. /// /// Prefer [`from_impl`](Self::from_impl) when the value type implements /// [`CacheCodecImpl`]. Use this for types where a direct impl isn't - /// possible (e.g. orphan rule prevents it). + /// possible (e.g. the orphan rule prevents it). `type_id` and `version` + /// play the same role as the corresponding [`CacheCodecImpl`] constants. 
pub fn new( - serialize: fn(&ArcAny, &mut dyn std::io::Write) -> Result<()>, - deserialize: fn(&Bytes) -> Result, + type_id: &'static str, + version: u32, + serialize_body: fn(&ArcAny, &mut CacheEntryWriter<'_>) -> Result<()>, + deserialize_body: fn(&mut CacheEntryReader<'_>) -> Result, ) -> Self { Self { - serialize, - deserialize, + type_id, + version, + serialize_body, + deserialize_body, } } /// Create a `CacheCodec` from a [`CacheCodecImpl`] implementation. - /// - /// For **sized** types stored directly in the cache. The codec - /// downcasts `&dyn Any` to `&T` for serialization and returns `Arc` - /// from deserialization. pub fn from_impl() -> Self { Self { - serialize: serialize_via_impl::, - deserialize: deserialize_via_impl::, + type_id: T::TYPE_ID, + version: T::CURRENT_VERSION, + serialize_body: serialize_via_impl::, + deserialize_body: deserialize_via_impl::, } } - pub fn serialize(&self, value: &ArcAny, writer: &mut dyn std::io::Write) -> Result<()> { - (self.serialize)(value, writer) + /// Serialize `value` into `writer`: envelope first, then the body. + pub fn serialize(&self, value: &ArcAny, writer: &mut dyn Write) -> Result<()> { + let body_offset = write_envelope(writer, self.type_id, self.version)?; + let mut entry_writer = CacheEntryWriter::with_pos(writer, body_offset); + (self.serialize_body)(value, &mut entry_writer) + } + + /// Deserialize an entry from `data`. + /// + /// Never fails: any non-fatal failure to interpret the bytes becomes a + /// [`CacheDecode::Miss`] with the reason why (see [`CacheMissReason`]). + /// Reading from an in-memory [`Bytes`] cannot do I/O, so there is no fault + /// channel — a miss is the only non-`Hit` outcome. 
+ pub fn deserialize(&self, data: &Bytes) -> CacheDecode { + let Some(envelope) = parse_envelope(data) else { + log::debug!("cache entry rejected: missing or invalid envelope"); + return CacheDecode::Miss(CacheMissReason::InvalidEnvelope); + }; + + if envelope.type_id != self.type_id { + log::debug!( + "cache entry type_id mismatch: got {:?}, expected {:?}", + envelope.type_id, + self.type_id + ); + return CacheDecode::Miss(CacheMissReason::TypeMismatch); + } + + // A version newer than this build writes was produced by a newer build + // whose body layout we cannot assume to understand. Older/equal versions + // are the impl's responsibility to handle (branching on reader.version()). + if envelope.type_version > self.version { + log::debug!( + "cache entry {:?} has unsupported type_version {} (this build writes {})", + self.type_id, + envelope.type_version, + self.version + ); + return CacheDecode::Miss(CacheMissReason::VersionTooNew); + } + + let mut reader = CacheEntryReader::new(data, envelope.body_offset, envelope.type_version); + match (self.deserialize_body)(&mut reader) { + Ok(value) => CacheDecode::Hit(value), + Err(e) => { + log::debug!( + "cache entry {:?} v{} failed to decode: {e}", + self.type_id, + envelope.type_version + ); + CacheDecode::Miss(CacheMissReason::BodyError) + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// A trivial codec used to exercise the envelope and miss semantics + /// without pulling in arrow-backed payloads. 
+ #[derive(Debug, PartialEq)] + struct Widget { + n: u32, + } + + impl CacheCodecImpl for Widget { + const TYPE_ID: &'static str = "test.Widget"; + const CURRENT_VERSION: u32 = 1; + + fn serialize(&self, writer: &mut CacheEntryWriter<'_>) -> Result<()> { + writer.write_raw(&self.n.to_le_bytes()) + } + + fn deserialize(reader: &mut CacheEntryReader<'_>) -> Result { + let bytes = reader.read_raw()?; + let n = u32::from_le_bytes( + bytes + .as_ref() + .try_into() + .map_err(|_| Error::io("bad widget".to_string()))?, + ); + Ok(Self { n }) + } + } + + fn serialize_widget(widget: &Widget) -> Bytes { + let codec = CacheCodec::from_impl::(); + let any: ArcAny = Arc::new(Widget { n: widget.n }); + let mut buf = Vec::new(); + codec.serialize(&any, &mut buf).unwrap(); + Bytes::from(buf) + } + + /// The miss reason, or `None` if the decode was a hit. + fn miss_reason(data: &Bytes) -> Option { + match deserialize_widget(data) { + CacheDecode::Hit(_) => None, + CacheDecode::Miss(reason) => Some(reason), + } } - pub fn deserialize(&self, data: &Bytes) -> Result { - (self.deserialize)(data) + fn deserialize_widget(data: &Bytes) -> CacheDecode { + let codec = CacheCodec::from_impl::(); + match codec.deserialize(data) { + CacheDecode::Hit(any) => { + CacheDecode::Hit(Arc::try_unwrap(any.downcast::().unwrap()).unwrap()) + } + CacheDecode::Miss(reason) => CacheDecode::Miss(reason), + } + } + + #[test] + fn envelope_roundtrip_hits() { + let bytes = serialize_widget(&Widget { n: 0xDEADBEEF }); + // Sanity: the entry starts with the magic. 
+ assert_eq!(&bytes[..4], b"LCE1"); + let decoded = deserialize_widget(&bytes).hit().unwrap(); + assert_eq!(decoded, Widget { n: 0xDEADBEEF }); + } + + #[test] + fn has_cache_envelope_detects_magic() { + let bytes = serialize_widget(&Widget { n: 1 }); + assert!(has_cache_envelope(&bytes)); + assert!(has_cache_envelope(&MAGIC)); // exactly the magic, nothing after + assert!(!has_cache_envelope(b"LCE")); // too short + assert!(!has_cache_envelope(b"JUNK and more")); + assert!(!has_cache_envelope(&[])); + } + + #[test] + fn wrong_magic_is_miss() { + let mut bytes = serialize_widget(&Widget { n: 7 }).to_vec(); + bytes[0] = b'X'; + assert_eq!( + miss_reason(&Bytes::from(bytes)), + Some(CacheMissReason::InvalidEnvelope) + ); + } + + #[test] + fn pre_stabilization_blob_is_miss() { + // An old unstable blob led with a small u64 LE length prefix (a JSON + // header of tens of bytes) — no magic. It must self-heal to a miss. + let mut blob = Vec::new(); + blob.extend_from_slice(&(42u64).to_le_bytes()); + blob.extend_from_slice(&[0u8; 42]); + assert_eq!( + miss_reason(&Bytes::from(blob)), + Some(CacheMissReason::InvalidEnvelope) + ); + + // A different unstable shape led with a small u8 tag (0/1/2). + assert_eq!( + miss_reason(&Bytes::from(vec![0u8, 1, 2, 3])), + Some(CacheMissReason::InvalidEnvelope) + ); + } + + #[test] + fn unknown_envelope_version_is_miss() { + let mut bytes = serialize_widget(&Widget { n: 7 }).to_vec(); + bytes[4] = 0xFF; // envelope_version byte + assert_eq!( + miss_reason(&Bytes::from(bytes)), + Some(CacheMissReason::InvalidEnvelope) + ); + } + + #[test] + fn type_id_mismatch_is_miss() { + // Hand-build an envelope with a foreign type_id but valid framing. 
+ let mut buf = Vec::new(); + write_envelope(&mut buf, "some.OtherType", 1).unwrap(); + buf.extend_from_slice(&(4u64).to_le_bytes()); + buf.extend_from_slice(&99u32.to_le_bytes()); + assert_eq!( + miss_reason(&Bytes::from(buf)), + Some(CacheMissReason::TypeMismatch) + ); + } + + #[test] + fn unsupported_future_type_version_is_miss() { + // An entry written by a newer build (higher type_version) must miss + // rather than be misread by this build. + let mut buf = Vec::new(); + write_envelope(&mut buf, Widget::TYPE_ID, Widget::CURRENT_VERSION + 1).unwrap(); + lance_arrow::ipc::write_len_prefixed_bytes(&mut buf, &9u32.to_le_bytes()).unwrap(); + assert_eq!( + miss_reason(&Bytes::from(buf)), + Some(CacheMissReason::VersionTooNew) + ); + } + + #[test] + fn truncated_envelope_is_miss() { + let bytes = serialize_widget(&Widget { n: 7 }); + for cut in [0, 1, 4, 5, 7, 9] { + assert_eq!( + miss_reason(&bytes.slice(..cut.min(bytes.len()))), + Some(CacheMissReason::InvalidEnvelope), + "truncating to {cut} bytes should miss as InvalidEnvelope" + ); + } + } + + #[test] + fn body_decode_error_is_miss() { + // Valid envelope, but the body is too short for the widget. + let mut buf = Vec::new(); + write_envelope(&mut buf, Widget::TYPE_ID, Widget::CURRENT_VERSION).unwrap(); + buf.extend_from_slice(&(1u64).to_le_bytes()); + buf.push(0u8); + assert_eq!( + miss_reason(&Bytes::from(buf)), + Some(CacheMissReason::BodyError) + ); + } + + #[test] + fn reader_exposes_envelope_version() { + // type_version travels through the envelope to reader.version(). + let mut buf = Vec::new(); + write_envelope(&mut buf, Widget::TYPE_ID, 7).unwrap(); + let body_off = buf.len(); + // A widget body so the codec can decode it. 
+ lance_arrow::ipc::write_len_prefixed_bytes(&mut buf, &5u32.to_le_bytes()).unwrap(); + let data = Bytes::from(buf); + + let mut r = CacheEntryReader::new(&data, body_off, 7); + assert_eq!(r.version(), 7); + assert_eq!(r.read_raw().unwrap().as_ref(), 5u32.to_le_bytes()); } } diff --git a/rust/lance-core/src/cache/entry_io.rs b/rust/lance-core/src/cache/entry_io.rs new file mode 100644 index 00000000000..fe91b11ca7d --- /dev/null +++ b/rust/lance-core/src/cache/entry_io.rs @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Streaming readers/writers for cache entry bodies. +//! +//! [`CacheCodecImpl`](super::CacheCodecImpl) bodies are written and read +//! through these wrappers. They keep serialization streaming (no buffering of +//! the whole entry) and reads zero-copy (sections borrow from the input +//! [`Bytes`]), while tracking the byte position needed to keep Arrow IPC +//! sections 64-byte aligned (see [`lance_arrow::ipc`]). +//! +//! Body layout primitives: +//! +//! ```text +//! HEADER : [header_len: u32 LE][header proto bytes] +//! ARROW_IPC : [pad to 64B][self-delimiting IPC stream] +//! RAW_BLOB : [len: u64 LE][bytes] +//! ``` + +use std::io::Write; + +use arrow_array::RecordBatch; +use bytes::Bytes; +use prost::Message; + +use crate::{Error, Result}; + +/// Writes a cache entry body: a header followed by sections, streaming +/// directly to the underlying writer. +/// +/// The envelope is written by the [`CacheCodec`](super::CacheCodec) wrapper +/// before this writer is handed to +/// [`CacheCodecImpl::serialize`](super::CacheCodecImpl::serialize). +pub struct CacheEntryWriter<'a> { + writer: &'a mut dyn Write, + /// Absolute byte offset within the entry, used to align IPC sections. + pos: usize, +} + +impl<'a> CacheEntryWriter<'a> { + /// Create a writer positioned at the start of an entry (offset 0). + /// + /// Use this for nested serialization into a standalone buffer. 
The + /// envelope-aware entry point is [`CacheCodec::serialize`](super::CacheCodec::serialize). + pub fn new(writer: &'a mut dyn Write) -> Self { + Self { writer, pos: 0 } + } + + /// Create a writer whose section alignment accounts for `pos` bytes + /// already written ahead of the body (i.e. the envelope). + pub(crate) fn with_pos(writer: &'a mut dyn Write, pos: usize) -> Self { + Self { writer, pos } + } + + /// Write a single discriminant byte (e.g. a variant tag). + pub fn write_u8(&mut self, value: u8) -> Result<()> { + self.writer.write_all(&[value])?; + self.pos += 1; + Ok(()) + } + + /// Write a protobuf header as `[len: u32 LE][bytes]`. + pub fn write_header(&mut self, header: &P) -> Result<()> { + let bytes = header.encode_to_vec(); + let len = u32::try_from(bytes.len()) + .map_err(|_| Error::io(format!("cache header too large: {} bytes", bytes.len())))?; + self.writer.write_all(&len.to_le_bytes())?; + self.writer.write_all(&bytes)?; + self.pos += 4 + bytes.len(); + Ok(()) + } + + /// Write `batch` as a 64-byte-aligned Arrow IPC section. + pub fn write_ipc(&mut self, batch: &RecordBatch) -> Result<()> { + lance_arrow::ipc::write_ipc_section(self.writer, &mut self.pos, batch) + .map_err(|e| Error::io(e.to_string())) + } + + /// Write `batches` as a single 64-byte-aligned multi-batch Arrow IPC + /// section. The iterator must yield at least one batch. + pub fn write_ipc_batches(&mut self, batches: I) -> Result<()> + where + I: IntoIterator, + { + lance_arrow::ipc::write_ipc_section_batches(self.writer, &mut self.pos, batches) + .map_err(|e| Error::io(e.to_string())) + } + + /// Write a raw blob as `[len: u64 LE][bytes]`. + /// + /// Only for byte payloads that already have their own stable, portable + /// encoding (e.g. a roaring bitmap, a varint-packed stream). 
+ pub fn write_raw(&mut self, bytes: &[u8]) -> Result<()> { + lance_arrow::ipc::write_len_prefixed_bytes(self.writer, bytes) + .map_err(|e| Error::io(e.to_string()))?; + self.pos += 8 + bytes.len(); + Ok(()) + } + + /// The underlying writer, for a payload that carries its own framing. + /// + /// Use this only when the codec writes a self-delimiting or whole-body + /// payload — e.g. streaming a roaring bitmap as the entire body, where the + /// length prefix of [`write_raw`](Self::write_raw) would be redundant and + /// buffering to measure that length would force an extra copy. For + /// structured bodies prefer [`write_header`](Self::write_header) / + /// [`write_ipc`](Self::write_ipc) / [`write_raw`](Self::write_raw), which + /// give you versioning and 64-byte IPC alignment. + /// + /// Bytes written through this do **not** advance the section-alignment + /// position, so it must not be interleaved with [`write_ipc`](Self::write_ipc). + pub fn raw_writer(&mut self) -> &mut dyn Write { + self.writer + } +} + +/// Reads a cache entry body, tracking an offset into the input and exposing +/// the entry's `type_version` so implementors can branch for backward compat. +/// +/// All reads are zero-copy: returned [`Bytes`] and the buffers behind decoded +/// [`RecordBatch`]es borrow from the input allocation. +pub struct CacheEntryReader<'a> { + data: &'a Bytes, + offset: usize, + version: u32, +} + +impl<'a> CacheEntryReader<'a> { + /// Create a reader over `data`, starting at body byte `offset`, for an + /// entry written at `version`. + pub fn new(data: &'a Bytes, offset: usize, version: u32) -> Self { + Self { + data, + offset, + version, + } + } + + /// The `type_version` from the envelope. Branch on this for backward compat. + pub fn version(&self) -> u32 { + self.version + } + + /// Read a single discriminant byte written by [`CacheEntryWriter::write_u8`]. 
+ pub fn read_u8(&mut self) -> Result<u8> { + let bytes = self.data.as_ref(); + let v = *bytes + .get(self.offset) + .ok_or_else(|| Error::io("cache entry: truncated, missing tag byte".to_string()))?; + self.offset += 1; + Ok(v) + } + + /// Read a protobuf header written by [`CacheEntryWriter::write_header`]. + pub fn read_header<P: Message + Default>(&mut self) -> Result<P>

{ + let bytes = self.data.as_ref(); + let len_end = self + .offset + .checked_add(4) + .filter(|&e| e <= bytes.len()) + .ok_or_else(|| Error::io("cache header: truncated length prefix".to_string()))?; + let len = u32::from_le_bytes(bytes[self.offset..len_end].try_into().unwrap()) as usize; + let data_end = len_end + .checked_add(len) + .filter(|&e| e <= bytes.len()) + .ok_or_else(|| Error::io("cache header: truncated body".to_string()))?; + let msg = P::decode(&bytes[len_end..data_end]) + .map_err(|e| Error::io(format!("cache header decode failed: {e}")))?; + self.offset = data_end; + Ok(msg) + } + + /// Read one [`RecordBatch`] from a 64-byte-aligned IPC section. + pub fn read_ipc(&mut self) -> Result { + lance_arrow::ipc::read_ipc_section_at(self.data, &mut self.offset) + .map_err(|e| Error::io(e.to_string())) + } + + /// Read all [`RecordBatch`]es from a 64-byte-aligned multi-batch IPC + /// section written by [`CacheEntryWriter::write_ipc_batches`]. + pub fn read_ipc_batches(&mut self) -> Result> { + lance_arrow::ipc::read_ipc_section_batches_at(self.data, &mut self.offset) + .map_err(|e| Error::io(e.to_string())) + } + + /// Read a raw blob written by [`CacheEntryWriter::write_raw`], zero-copy. + pub fn read_raw(&mut self) -> Result { + lance_arrow::ipc::read_len_prefixed_bytes_at(self.data, &mut self.offset) + .map_err(|e| Error::io(e.to_string())) + } + + /// The not-yet-consumed body bytes as a zero-copy slice. + /// + /// For a payload that carries its own framing and is parsed with the + /// codec's own cursor — the read counterpart of + /// [`CacheEntryWriter::raw_writer`]. For structured bodies prefer + /// [`read_header`](Self::read_header) / [`read_ipc`](Self::read_ipc) / + /// [`read_raw`](Self::read_raw). + pub fn body(&self) -> Bytes { + self.data.slice(self.offset..) 
+ } +} diff --git a/rust/lance-core/src/cache/mod.rs b/rust/lance-core/src/cache/mod.rs index bea700cad90..07038c6e9d5 100644 --- a/rust/lance-core/src/cache/mod.rs +++ b/rust/lance-core/src/cache/mod.rs @@ -47,10 +47,14 @@ pub mod backend; pub mod codec; +mod entry_io; mod moka; pub use backend::{CacheBackend, CacheEntry, CacheKeyIterator, InternalCacheKey}; -pub use codec::{CacheCodec, CacheCodecImpl}; +pub use codec::{ + CacheCodec, CacheCodecImpl, CacheDecode, CacheMissReason, MAGIC, has_cache_envelope, +}; +pub use entry_io::{CacheEntryReader, CacheEntryWriter}; pub use moka::MokaCacheBackend; use std::borrow::Cow; diff --git a/rust/lance-index/build.rs b/rust/lance-index/build.rs index 0617de8c806..b47744f7b5a 100644 --- a/rust/lance-index/build.rs +++ b/rust/lance-index/build.rs @@ -6,6 +6,9 @@ use std::io::Result; fn main() -> Result<()> { println!("cargo:rerun-if-changed=protos"); + // Cache-entry protos are library-internal serialization, not part of the + // on-disk format spec, so they live here rather than in the shared `protos/`. + println!("cargo:rerun-if-changed=protos-cache"); #[cfg(feature = "protoc")] // Use vendored protobuf compiler if requested. 
@@ -17,8 +20,12 @@ fn main() -> Result<()> { prost_build.protoc_arg("--experimental_allow_proto3_optional"); prost_build.enable_type_names(); prost_build.compile_protos( - &["./protos/index.proto", "./protos/index_old.proto"], - &["./protos"], + &[ + "./protos/index.proto", + "./protos/index_old.proto", + "./protos-cache/cache.proto", + ], + &["./protos", "./protos-cache"], )?; let rust_toolchain = env::var("RUSTUP_TOOLCHAIN") diff --git a/rust/lance-index/protos-cache/cache.proto b/rust/lance-index/protos-cache/cache.proto new file mode 100644 index 00000000000..b24a27055d7 --- /dev/null +++ b/rust/lance-index/protos-cache/cache.proto @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +// Protobuf headers for serialized index cache entries. +// +// These messages describe the *cache* serialization format, not the on-disk +// Lance format spec, so they live with the library (lance-index) rather than in +// the top-level `protos/` spec folder. +// +// Field numbers and enum values are append-only across all messages here: never +// renumber or reuse them. A change the proto cannot express transparently +// (adding/removing/reordering the IPC/raw sections that follow a header) must +// bump the relevant codec's `CURRENT_VERSION` instead. + +syntax = "proto3"; + +package lance.index.cache; + +// --------------------------------------------------------------------------- +// Full-text search (FTS) posting lists +// --------------------------------------------------------------------------- + +// Header for a serialized `CompressedPostingList` cache entry. +message CompressedPostingHeader { + float max_score = 1; + uint32 length = 2; + PostingTailCodec posting_tail_codec = 3; + PositionStorage position_storage = 4; + // Only meaningful when position_storage == POSITION_STORAGE_SHARED. + PositionStreamCodec position_stream_codec = 5; +} + +// Header for a serialized `PlainPostingList` cache entry. 
Followed by an Arrow +// IPC section of (row_ids: UInt64, frequencies: Float32), then — when +// position_storage == POSITION_STORAGE_LEGACY — an IPC section of the per-doc +// position list. Plain postings never carry a shared position stream. +message PlainPostingHeader { + // Absent when the posting has no precomputed block-max score (the in-memory + // `max_score` is `None`); present otherwise. + optional float max_score = 1; + // POSITION_STORAGE_NONE or POSITION_STORAGE_LEGACY only. + PositionStorage position_storage = 2; +} + +// Header for a serialized standalone `Positions` cache entry. Followed by the +// position sections framed per `position_storage`, which is never +// POSITION_STORAGE_NONE for a standalone entry. +message PositionsHeader { + PositionStorage position_storage = 1; + // Only meaningful when position_storage == POSITION_STORAGE_SHARED. + PositionStreamCodec position_stream_codec = 2; +} + +// Header for a serialized `PostingListGroup`: a member count followed by that +// many `PostingList` bodies written inline. Each member body is +// self-delimiting, so members need no length prefixes, and writing them inline +// keeps their Arrow IPC sections 64-byte aligned within the group entry. +message PostingListGroupHeader { + uint32 count = 1; +} + +// Tail-block encoding of a compressed posting list. +enum PostingTailCodec { + POSTING_TAIL_CODEC_FIXED32 = 0; + POSTING_TAIL_CODEC_VARINT_DELTA = 1; +} + +// Encoding of a shared position stream's byte buffer. +enum PositionStreamCodec { + POSITION_STREAM_CODEC_VARINT_DOC_DELTA = 0; + POSITION_STREAM_CODEC_PACKED_DELTA = 1; +} + +// Which (if any) positions accompany the posting list, and how they are framed +// in the sections after the header. +enum PositionStorage { + POSITION_STORAGE_NONE = 0; + // Legacy per-doc positions as a single Arrow IPC section. 
+ POSITION_STORAGE_LEGACY = 1; + // Shared stream: an Arrow IPC section of block offsets, then a raw blob of + // the (codec-encoded) position bytes. + POSITION_STORAGE_SHARED = 2; +} + +// --------------------------------------------------------------------------- +// Scalar indices +// --------------------------------------------------------------------------- + +// Header for a serialized `BTreeIndexState` cache entry, followed by a single +// Arrow IPC section holding the page-lookup batch. +message BTreeIndexHeader { + uint64 batch_size = 1; + // Whether an explicit page-range -> file mapping is present. Distinguishes a + // non-range-partitioned index (false) from a range-partitioned one whose map + // happens to be empty (true with no entries). + bool has_ranges_to_files = 2; + repeated RangeToFile ranges_to_files = 3; +} + +// One entry of a `BTreeIndexState` page-range -> file mapping. The range is +// inclusive on both ends (a `RangeInclusive`). +message RangeToFile { + uint32 start = 1; + uint32 end = 2; + uint32 page_offset = 3; + string path = 4; +} + +// --------------------------------------------------------------------------- +// Vector indices (IVF partitions) +// --------------------------------------------------------------------------- + +// Headers for serialized IVF partition cache entries (`PartitionEntry`). +// +// Each header is followed by 64-byte-aligned Arrow IPC sections in a fixed, +// version-keyed order (sub-index, then any quantizer-specific arrays, then the +// quantizer storage batches). + +// Distance metric a quantizer's storage was built for. +enum DistanceType { + DISTANCE_TYPE_L2 = 0; + DISTANCE_TYPE_COSINE = 1; + DISTANCE_TYPE_DOT = 2; + DISTANCE_TYPE_HAMMING = 3; +} + +// Rotation applied by a RabitQ quantizer. +enum RotationType { + ROTATION_TYPE_MATRIX = 0; + ROTATION_TYPE_FAST = 1; +} + +// Estimator a RabitQ quantizer uses at query time. 
+enum RabitQueryEstimator { + RABIT_QUERY_ESTIMATOR_RESIDUAL_QUERY = 0; + RABIT_QUERY_ESTIMATOR_RAW_QUERY = 1; +} + +// Product quantizer. Sections: sub-index IPC, codebook IPC, storage IPC. +message PqPartitionHeader { + DistanceType distance_type = 1; + uint32 nbits = 2; + uint64 num_sub_vectors = 3; + uint64 dimension = 4; + bool transposed = 5; +} + +// Flat (float) and flat-binary quantizers. Sections: sub-index IPC, storage IPC. +message FlatPartitionHeader { + DistanceType distance_type = 1; + uint64 dim = 2; +} + +// Scalar quantizer. Sections: sub-index IPC, storage IPC (possibly multi-batch). +message SqPartitionHeader { + DistanceType distance_type = 1; + uint32 num_bits = 2; + uint64 dim = 3; + double bounds_start = 4; + double bounds_end = 5; +} + +// Header for a serialized IVF index state (`IvfIndexState`), followed by +// three raw blobs: the IVF model protobuf, the quantizer's extra-metadata +// buffer (may be empty), and the auxiliary IVF model protobuf. +message IvfStateHeader { + string index_file_path = 1; + string uuid = 2; + string distance_type = 3; + repeated string sub_index_metadata = 4; + string sub_index_type = 5; + string quantization_type = 6; + // Per-quantizer `Q::Metadata` as JSON. Kept as a string because the metadata + // type is generic over the quantizer; the proto envelope still provides + // additive evolution for the surrounding fields. + string quantizer_metadata_json = 7; + string cache_key_prefix = 8; + uint64 index_file_size = 9; + uint64 aux_file_size = 10; +} + +// RabitQ quantizer. Sections: sub-index IPC, rotate-matrix IPC (Matrix rotation +// only), storage IPC. +message RabitPartitionHeader { + DistanceType distance_type = 1; + uint32 num_bits = 2; + uint32 code_dim = 3; + RotationType rotation_type = 4; + // Fast-rotation sign vector; present only when rotation_type == + // ROTATION_TYPE_FAST (the Matrix case stores its rotation as an IPC section). 
+ optional bytes fast_rotation_signs = 5; + // Estimator the RabitQ storage uses at query time (residual vs raw query). + RabitQueryEstimator query_estimator = 6; +} diff --git a/rust/lance-index/src/lib.rs b/rust/lance-index/src/lib.rs index c7cace92428..20e1c2692d9 100644 --- a/rust/lance-index/src/lib.rs +++ b/rust/lance-index/src/lib.rs @@ -68,6 +68,13 @@ pub mod pbold { include!(concat!(env!("OUT_DIR"), "/lance.table.rs")); } +/// Protobuf headers for serialized index cache entries (FTS posting lists, +/// scalar indices, and IVF vector partitions). +pub mod cache_pb { + #![allow(clippy::use_self)] + include!(concat!(env!("OUT_DIR"), "/lance.index.cache.rs")); +} + /// Generic methods common across all types of secondary indices /// #[async_trait] diff --git a/rust/lance-index/src/scalar/bitmap.rs b/rust/lance-index/src/scalar/bitmap.rs index 1ae2faf6e6b..c2a6e80e82b 100644 --- a/rust/lance-index/src/scalar/bitmap.rs +++ b/rust/lance-index/src/scalar/bitmap.rs @@ -18,14 +18,13 @@ use bytes::Bytes; use datafusion::physical_plan::SendableRecordBatchStream; use datafusion_common::ScalarValue; use futures::{StreamExt, TryStreamExt, stream}; -use lance_arrow::ipc::{ - read_ipc_stream_single_at, read_len_prefixed_bytes_at, write_ipc_stream, - write_len_prefixed_bytes, -}; use lance_core::deepsize::DeepSizeOf; use lance_core::{ Error, ROW_ID, Result, - cache::{CacheCodec, CacheCodecImpl, CacheKey, LanceCache, WeakLanceCache}, + cache::{ + CacheCodec, CacheCodecImpl, CacheEntryReader, CacheEntryWriter, CacheKey, LanceCache, + WeakLanceCache, + }, error::LanceOptionExt, utils::tokio::get_num_compute_intensive_cpus, }; @@ -212,6 +211,32 @@ impl BitmapIndexState { frag_reuse_index, ))) } + + /// Build a state directly from its parts, for codec tests in sibling + /// modules (e.g. the label-list index, which nests a bitmap state). 
+ #[cfg(test)] + pub(crate) fn new_for_test( + index_map: BTreeMap, + null_map: RowAddrTreeMap, + value_type: DataType, + ) -> Result { + Ok(Self { + lookup_batch: build_lookup_batch(&index_map, &value_type)?, + null_map: Arc::new(null_map), + value_type, + index_map: Arc::new(index_map), + }) + } + + #[cfg(test)] + pub(crate) fn lookup_batch(&self) -> &RecordBatch { + &self.lookup_batch + } + + #[cfg(test)] + pub(crate) fn null_map(&self) -> &RowAddrTreeMap { + &self.null_map + } } fn build_lookup_batch( @@ -251,25 +276,27 @@ fn parse_lookup_batch(batch: &RecordBatch) -> Result, offsets: UInt64)] + /// RAW_BLOB : null_map (roaring tree map, portable encoding) + /// ARROW_IPC : (keys: , offsets: UInt64) /// ``` - /// The value type is recovered from the IPC stream schema. - fn serialize(&self, writer: &mut dyn std::io::Write) -> Result<()> { + /// The value type is recovered from the IPC section schema. + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { let mut null_bytes = Vec::with_capacity(self.null_map.serialized_size()); self.null_map.serialize_into(&mut null_bytes)?; - write_len_prefixed_bytes(writer, &null_bytes)?; - write_ipc_stream(&self.lookup_batch, writer)?; + w.write_raw(&null_bytes)?; + w.write_ipc(&self.lookup_batch)?; Ok(()) } - fn deserialize(data: &bytes::Bytes) -> Result { - let mut offset = 0; - let null_bytes = read_len_prefixed_bytes_at(data, &mut offset)?; + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let null_bytes = r.read_raw()?; let null_map = Arc::new(RowAddrTreeMap::deserialize_from(null_bytes.as_ref())?); - let lookup_batch = read_ipc_stream_single_at(data, &mut offset)?; + let lookup_batch = r.read_ipc()?; let value_type = lookup_batch.schema().field(0).data_type().clone(); let index_map = Arc::new(parse_lookup_batch(&lookup_batch)?); Ok(Self { @@ -1821,8 +1848,12 @@ mod tests { fn assert_state_roundtrips(state: &BitmapIndexState) { let mut buf = Vec::new(); - state.serialize(&mut buf).unwrap(); - let 
restored = BitmapIndexState::deserialize(&bytes::Bytes::from(buf)).unwrap(); + state + .serialize(&mut CacheEntryWriter::new(&mut buf)) + .unwrap(); + let data = bytes::Bytes::from(buf); + let mut reader = CacheEntryReader::new(&data, 0, BitmapIndexState::CURRENT_VERSION); + let restored = BitmapIndexState::deserialize(&mut reader).unwrap(); assert_eq!(restored.lookup_batch, state.lookup_batch); assert_eq!(&*restored.null_map, &*state.null_map); assert_eq!(restored.value_type, state.value_type); @@ -1856,6 +1887,53 @@ mod tests { assert_state_roundtrips(&empty_state); } + /// The lookup batch must decode zero-copy through the full envelope-bearing + /// [`CacheCodec`] even though the envelope pushes the IPC section to a + /// non-aligned starting offset. + #[test] + fn test_bitmap_index_state_lookup_is_zero_copy() { + const ALIGN: usize = 64; + let mut index_map = BTreeMap::new(); + for k in 0..32i32 { + index_map.insert( + OrderableScalarValue(ScalarValue::Int32(Some(k))), + k as usize, + ); + } + let state = BitmapIndexState { + lookup_batch: build_lookup_batch(&index_map, &DataType::Int32).unwrap(), + null_map: Arc::new(RowAddrTreeMap::new()), + value_type: DataType::Int32, + index_map: Arc::new(index_map), + }; + + let codec = CacheCodec::from_impl::(); + let any: Arc = Arc::new(state); + let mut buf = Vec::new(); + codec.serialize(&any, &mut buf).unwrap(); + + // Model a backend reading into a 64-byte-aligned buffer. 
+ let mut v = vec![0u8; buf.len() + ALIGN]; + let pad = (ALIGN - (v.as_ptr() as usize % ALIGN)) % ALIGN; + v[pad..pad + buf.len()].copy_from_slice(&buf); + let data = bytes::Bytes::from(v).slice(pad..pad + buf.len()); + + let restored = codec.deserialize(&data).hit().unwrap(); + let restored = restored.downcast::().unwrap(); + + let base = data.as_ptr() as usize; + let end = base + data.len(); + for col in restored.lookup_batch.columns() { + for buffer in col.to_data().buffers() { + let ptr = buffer.as_ptr() as usize; + assert!( + ptr >= base && ptr < end, + "lookup batch buffer was realigned out of the input — misaligned IPC section", + ); + } + } + } + #[tokio::test] async fn test_bitmap_lazy_loading_and_cache() { // Create a temporary directory for the index diff --git a/rust/lance-index/src/scalar/btree.rs b/rust/lance-index/src/scalar/btree.rs index 6d21e842e04..9abd69022c0 100644 --- a/rust/lance-index/src/scalar/btree.rs +++ b/rust/lance-index/src/scalar/btree.rs @@ -15,6 +15,7 @@ use super::{ OldIndexDataFilter, SargableQuery, ScalarIndex, ScalarIndexParams, SearchResult, compute_next_prefix, }; +use crate::cache_pb::{BTreeIndexHeader, RangeToFile}; use crate::{Index, IndexType}; use crate::{ frag_reuse::FragReuseIndex, @@ -52,11 +53,13 @@ use futures::{ future::BoxFuture, stream::{self}, }; -use lance_arrow::ipc::{read_ipc_stream_single_at, write_ipc_stream}; use lance_core::deepsize::DeepSizeOf; use lance_core::{ Error, ROW_ID, Result, - cache::{CacheCodec, CacheCodecImpl, CacheKey, LanceCache, WeakLanceCache}, + cache::{ + CacheCodec, CacheCodecImpl, CacheEntryReader, CacheEntryWriter, CacheKey, LanceCache, + WeakLanceCache, + }, error::LanceOptionExt, utils::{ tokio::get_num_compute_intensive_cpus, @@ -1402,106 +1405,58 @@ impl BTreeIndexState { } impl CacheCodecImpl for BTreeIndexState { - /// Wire format (no stability guarantees yet — the cache is rebuilt from - /// source on any version mismatch): + const TYPE_ID: &'static str = 
"lance.scalar.BTreeIndexState"; + const CURRENT_VERSION: u32 = 1; + + /// Wire format: /// ```text - /// u64 batch_size (LE) - /// u8 has_ranges (0 = None, 1 = Some) - /// if has_ranges: - /// u32 entry_count (LE) - /// per entry: u32 start | u32 end | u32 offset | u32 path_len | path bytes - /// lookup batch (Arrow IPC stream) + /// HEADER : BTreeIndexHeader proto (batch_size + page-range mapping) + /// ARROW_IPC : page-lookup batch /// ``` - fn serialize(&self, writer: &mut dyn std::io::Write) -> Result<()> { - writer.write_all(&self.batch_size.to_le_bytes())?; - match &self.ranges_to_files { - None => writer.write_all(&[0u8])?, - Some(ranges) => { - writer.write_all(&[1u8])?; - let count = u32::try_from(ranges.len()).map_err(|_| { - Error::io("BTreeIndexState: ranges_to_files exceeds u32::MAX entries") - })?; - writer.write_all(&count.to_le_bytes())?; - for (range, (path, page_offset)) in ranges.iter() { - writer.write_all(&range.start().to_le_bytes())?; - writer.write_all(&range.end().to_le_bytes())?; - writer.write_all(&page_offset.to_le_bytes())?; - let path_len = u32::try_from(path.len()).map_err(|_| { - Error::io("BTreeIndexState: ranges_to_files path exceeds u32::MAX bytes") - })?; - writer.write_all(&path_len.to_le_bytes())?; - writer.write_all(path.as_bytes())?; - } - } - } - write_ipc_stream(&self.lookup_batch, writer)?; + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { + let ranges_to_files = match &self.ranges_to_files { + None => Vec::new(), + Some(ranges) => ranges + .iter() + .map(|(range, (path, page_offset))| RangeToFile { + start: *range.start(), + end: *range.end(), + page_offset: *page_offset, + path: path.clone(), + }) + .collect(), + }; + let header = BTreeIndexHeader { + batch_size: self.batch_size, + has_ranges_to_files: self.ranges_to_files.is_some(), + ranges_to_files, + }; + w.write_header(&header)?; + w.write_ipc(&self.lookup_batch)?; Ok(()) } - fn deserialize(data: &bytes::Bytes) -> Result { - let mut offset = 0; - 
let batch_size = read_u64_le(data, &mut offset)?; - let has_ranges = read_u8(data, &mut offset)?; - let ranges_to_files = match has_ranges { - 0 => None, - 1 => { - let count = read_u32_le(data, &mut offset)? as usize; - let mut entries = Vec::with_capacity(count); - for _ in 0..count { - let start = read_u32_le(data, &mut offset)?; - let end = read_u32_le(data, &mut offset)?; - let page_offset = read_u32_le(data, &mut offset)?; - let path_len = read_u32_le(data, &mut offset)? as usize; - let path = read_bytes(data, &mut offset, path_len)?; - let path = std::str::from_utf8(&path) - .map_err(|e| Error::io(format!("BTreeIndexState path: {e}")))? - .to_string(); - entries.push((start..=end, (path, page_offset))); - } - Some(Arc::new(entries.into_iter().collect())) - } - other => { - return Err(Error::io(format!( - "BTreeIndexState: invalid has_ranges tag {other}" - ))); - } + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let header: BTreeIndexHeader = r.read_header()?; + let ranges_to_files = if header.has_ranges_to_files { + let map: RangeInclusiveMap = header + .ranges_to_files + .into_iter() + .map(|entry| (entry.start..=entry.end, (entry.path, entry.page_offset))) + .collect(); + Some(Arc::new(map)) + } else { + None }; - let lookup_batch = read_ipc_stream_single_at(data, &mut offset)?; + let lookup_batch = r.read_ipc()?; Ok(Self { lookup_batch, - batch_size, + batch_size: header.batch_size, ranges_to_files, }) } } -fn read_bytes(data: &bytes::Bytes, offset: &mut usize, len: usize) -> Result { - if data.len() < *offset + len { - return Err(Error::io(format!( - "BTreeIndexState: short read of {len} bytes at offset {offset} (have {})", - data.len() - ))); - } - let slice = data.slice(*offset..*offset + len); - *offset += len; - Ok(slice) -} - -fn read_u8(data: &bytes::Bytes, offset: &mut usize) -> Result { - let bytes = read_bytes(data, offset, 1)?; - Ok(bytes[0]) -} - -fn read_u32_le(data: &bytes::Bytes, offset: &mut usize) -> Result { - let bytes = 
read_bytes(data, offset, 4)?; - Ok(u32::from_le_bytes(bytes.as_ref().try_into().unwrap())) -} - -fn read_u64_le(data: &bytes::Bytes, offset: &mut usize) -> Result { - let bytes = read_bytes(data, offset, 8)?; - Ok(u64::from_le_bytes(bytes.as_ref().try_into().unwrap())) -} - /// Cache key for a [`BTreeIndexState`]. The cache it is used with is already /// namespaced per-index, so the key string is a constant. struct BTreeIndexStateKey; @@ -3286,7 +3241,23 @@ mod tests { }; use crate::scalar::registry::ScalarIndexPlugin; use arrow_array::RecordBatch; - use lance_core::cache::{CacheCodecImpl, CacheKey}; + use lance_core::cache::{CacheCodecImpl, CacheEntryReader, CacheEntryWriter, CacheKey}; + + /// Serialize a `BTreeIndexState` body (no envelope) for tests. + fn serialize_state(state: &BTreeIndexState) -> Vec { + let mut buf = Vec::new(); + state + .serialize(&mut CacheEntryWriter::new(&mut buf)) + .unwrap(); + buf + } + + /// Deserialize a `BTreeIndexState` body (no envelope) for tests. + fn deserialize_state(buf: Vec) -> lance_core::Result { + let data = bytes::Bytes::from(buf); + let mut reader = CacheEntryReader::new(&data, 0, BTreeIndexState::CURRENT_VERSION); + BTreeIndexState::deserialize(&mut reader) + } use rangemap::RangeInclusiveMap; lance_testing::define_stage_event_progress!( @@ -5919,9 +5890,7 @@ mod tests { } fn assert_state_roundtrips(state: &BTreeIndexState) { - let mut buf = Vec::new(); - state.serialize(&mut buf).unwrap(); - let restored = BTreeIndexState::deserialize(&bytes::Bytes::from(buf)).unwrap(); + let restored = deserialize_state(serialize_state(state)).unwrap(); assert_eq!(restored.lookup_batch, state.lookup_batch); assert_eq!(restored.batch_size, state.batch_size); assert_eq!(restored.ranges_to_files, state.ranges_to_files); @@ -5990,9 +5959,7 @@ mod tests { batch_size: index.batch_size, ranges_to_files: index.ranges_to_files.clone(), }; - let mut buf = Vec::new(); - state.serialize(&mut buf).unwrap(); - let restored = 
BTreeIndexState::deserialize(&bytes::Bytes::from(buf)).unwrap(); + let restored = deserialize_state(serialize_state(&state)).unwrap(); let reconstructed = restored .reconstruct(test_store.clone(), &LanceCache::no_cache(), None) .unwrap(); @@ -6028,18 +5995,57 @@ mod tests { assert_eq!(expected, actual); } + /// The lookup batch must decode zero-copy through the full envelope even + /// though the proto header pushes the IPC section to a non-aligned offset. + #[test] + fn test_btree_index_state_lookup_is_zero_copy() { + use lance_core::cache::CacheCodec; + const ALIGN: usize = 64; + + let ranges: RangeInclusiveMap = + [(0..=99, ("part_0_page_file.lance".to_string(), 0))] + .into_iter() + .collect(); + let state = BTreeIndexState { + lookup_batch: sample_lookup_batch(), + batch_size: 8192, + ranges_to_files: Some(Arc::new(ranges)), + }; + + let codec = CacheCodec::from_impl::(); + let any: Arc = Arc::new(state); + let mut buf = Vec::new(); + codec.serialize(&any, &mut buf).unwrap(); + + let mut v = vec![0u8; buf.len() + ALIGN]; + let pad = (ALIGN - (v.as_ptr() as usize % ALIGN)) % ALIGN; + v[pad..pad + buf.len()].copy_from_slice(&buf); + let data = bytes::Bytes::from(v).slice(pad..pad + buf.len()); + + let restored = codec.deserialize(&data).hit().unwrap(); + let restored = restored.downcast::().unwrap(); + + let base = data.as_ptr() as usize; + let end = base + data.len(); + for col in restored.lookup_batch.columns() { + for buffer in col.to_data().buffers() { + let ptr = buffer.as_ptr() as usize; + assert!( + ptr >= base && ptr < end, + "lookup batch buffer was realigned out of the input — misaligned IPC section", + ); + } + } + } + #[test] - fn test_btree_index_state_rejects_invalid_has_ranges_tag() { - // u64 batch_size (any) then a bad has_ranges tag. + fn test_btree_index_state_rejects_truncated_header() { + // A header length prefix that overruns the buffer must error rather + // than panic or silently misread it. 
let mut buf = Vec::new(); - buf.extend_from_slice(&1000u64.to_le_bytes()); - buf.push(7u8); - let err = BTreeIndexState::deserialize(&bytes::Bytes::from(buf)).unwrap_err(); - let msg = err.to_string(); - assert!( - msg.contains("has_ranges") && msg.contains("7"), - "expected error to mention the bad has_ranges tag, got: {msg}" - ); + buf.extend_from_slice(&100u32.to_le_bytes()); // claims a 100-byte header + buf.extend_from_slice(&[0u8; 4]); // but only 4 bytes follow + assert!(deserialize_state(buf).is_err()); } #[tokio::test] diff --git a/rust/lance-index/src/scalar/btree/flat.rs b/rust/lance-index/src/scalar/btree/flat.rs index 212ef6490be..045b4c95c55 100644 --- a/rust/lance-index/src/scalar/btree/flat.rs +++ b/rust/lance-index/src/scalar/btree/flat.rs @@ -13,9 +13,8 @@ use datafusion_common::DFSchema; use datafusion_expr::execution_props::ExecutionProps; use datafusion_physical_expr::create_physical_expr; use lance_arrow::RecordBatchExt; -use lance_arrow::ipc::{read_ipc_stream_single_at, read_len_prefixed_bytes_at, write_ipc_stream}; use lance_core::Result; -use lance_core::cache::CacheCodecImpl; +use lance_core::cache::{CacheCodecImpl, CacheEntryReader, CacheEntryWriter}; use lance_core::deepsize::DeepSizeOf; use lance_core::utils::address::RowAddress; use lance_select::{NullableRowAddrSet, RowAddrTreeMap, RowSetOps}; @@ -236,32 +235,38 @@ impl FlatIndex { } impl CacheCodecImpl for FlatIndex { - fn serialize(&self, writer: &mut dyn std::io::Write) -> Result<()> { + const TYPE_ID: &'static str = "lance.scalar.FlatIndex"; + const CURRENT_VERSION: u32 = 1; + + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { // Format: - // [len-prefixed all_addrs_map][len-prefixed null_addrs_map][batch IPC stream] - writer.write_all(&(self.all_addrs_map.serialized_size() as u64).to_le_bytes())?; - self.all_addrs_map.serialize_into(&mut *writer)?; + // RAW_BLOB : all_addrs_map (roaring tree map) + // RAW_BLOB : null_addrs_map (roaring tree map) + // ARROW_IPC : 
data batch + let mut all_addrs_bytes = Vec::with_capacity(self.all_addrs_map.serialized_size()); + self.all_addrs_map.serialize_into(&mut all_addrs_bytes)?; + w.write_raw(&all_addrs_bytes)?; - writer.write_all(&(self.null_addrs_map.serialized_size() as u64).to_le_bytes())?; - self.null_addrs_map.serialize_into(&mut *writer)?; + let mut null_addrs_bytes = Vec::with_capacity(self.null_addrs_map.serialized_size()); + self.null_addrs_map.serialize_into(&mut null_addrs_bytes)?; + w.write_raw(&null_addrs_bytes)?; - write_ipc_stream(self.data.as_ref(), writer)?; + w.write_ipc(self.data.as_ref())?; Ok(()) } - fn deserialize(data: &bytes::Bytes) -> Result + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result where Self: Sized, { - let mut offset = 0; - let all_addrs_bytes = read_len_prefixed_bytes_at(data, &mut offset)?; + let all_addrs_bytes = r.read_raw()?; let all_addrs_map = RowAddrTreeMap::deserialize_from(all_addrs_bytes.as_ref())?; - let null_addrs_bytes = read_len_prefixed_bytes_at(data, &mut offset)?; + let null_addrs_bytes = r.read_raw()?; let null_addrs_map = RowAddrTreeMap::deserialize_from(null_addrs_bytes.as_ref())?; - let batch = read_ipc_stream_single_at(data, &mut offset)?; + let batch = r.read_ipc()?; let df_schema = DFSchema::try_from(batch.schema())?; @@ -309,8 +314,12 @@ mod tests { fn assert_roundtrips(index: &FlatIndex) { let mut buf = Vec::new(); - index.serialize(&mut buf).unwrap(); - let restored = FlatIndex::deserialize(&bytes::Bytes::from(buf)).unwrap(); + index + .serialize(&mut CacheEntryWriter::new(&mut buf)) + .unwrap(); + let data = bytes::Bytes::from(buf); + let mut reader = CacheEntryReader::new(&data, 0, FlatIndex::CURRENT_VERSION); + let restored = FlatIndex::deserialize(&mut reader).unwrap(); assert_eq!(restored.data, index.data); assert_eq!(restored.all_addrs_map, index.all_addrs_map); @@ -335,6 +344,41 @@ mod tests { assert_roundtrips(&FlatIndex::try_new(empty).unwrap()); } + /// The data batch must decode zero-copy through the 
full envelope-bearing + /// [`CacheCodec`], even though the two roaring blobs and the envelope push + /// the IPC section to a non-aligned starting offset. + #[test] + fn test_flat_index_data_is_zero_copy() { + use lance_core::cache::CacheCodec; + const ALIGN: usize = 64; + + let index = example_index(); + let codec = CacheCodec::from_impl::(); + let any: Arc = Arc::new(index); + let mut buf = Vec::new(); + codec.serialize(&any, &mut buf).unwrap(); + + let mut v = vec![0u8; buf.len() + ALIGN]; + let pad = (ALIGN - (v.as_ptr() as usize % ALIGN)) % ALIGN; + v[pad..pad + buf.len()].copy_from_slice(&buf); + let data = bytes::Bytes::from(v).slice(pad..pad + buf.len()); + + let restored = codec.deserialize(&data).hit().unwrap(); + let restored = restored.downcast::().unwrap(); + + let base = data.as_ptr() as usize; + let end = base + data.len(); + for col in restored.data.columns() { + for buffer in col.to_data().buffers() { + let ptr = buffer.as_ptr() as usize; + assert!( + ptr >= base && ptr < end, + "data batch buffer was realigned out of the input — misaligned IPC section", + ); + } + } + } + #[tokio::test] async fn test_equality() { check_index(&SargableQuery::Equals(ScalarValue::from(100)), &[0]).await; diff --git a/rust/lance-index/src/scalar/inverted/cache_codec.rs b/rust/lance-index/src/scalar/inverted/cache_codec.rs index 74cfc98ef7b..a676455d5c9 100644 --- a/rust/lance-index/src/scalar/inverted/cache_codec.rs +++ b/rust/lance-index/src/scalar/inverted/cache_codec.rs @@ -4,16 +4,24 @@ //! Cache codec impls for FTS index entries. //! //! Serializes [`PostingList`] and [`Positions`] cache values for persistent -//! cache backends. The format is a small variant tag plus a JSON header for -//! scalar metadata, with Arrow-backed payload sections written as zero-copy -//! Arrow IPC streams via [`lance_arrow::ipc`]. The raw byte buffer inside -//! [`SharedPositionStream`] is written via [`write_len_prefixed_bytes`] and -//! 
read back via [`read_len_prefixed_bytes_at`] -- both zero-copy slices into -//! the input `Bytes` allocation. +//! cache backends, behind the stabilized envelope written by +//! [`CacheCodec`](lance_core::cache::CacheCodec). //! -//! This is the FTS counterpart of `partition_serde.rs` for vector indices. +//! Every variant uses a protobuf header (see `protos-cache/cache.proto`, with the +//! tail/position codecs and position-storage kind as proto enums) followed by +//! 64-byte-aligned Arrow IPC sections and, where applicable, raw blobs: +//! +//! - the compressed posting list: an IPC section for `blocks`, then the +//! position sections (legacy IPC, or shared block-offsets IPC + a raw blob of +//! the [`SharedPositionStream`] byte buffer, which has its own portable +//! encoding); +//! - the plain posting list: an IPC section of `(row_ids, frequencies)`, then +//! an optional legacy position IPC section; +//! - the standalone [`Positions`] codec: the position sections alone. +//! +//! All sections read back zero-copy via [`lance_arrow::ipc`]. This is the FTS +//! counterpart of `partition_serde.rs` for vector indices. 
-use std::io::Write; use std::sync::Arc; use arrow_array::cast::AsArray; @@ -22,14 +30,14 @@ use arrow_array::{ Array, Float32Array, LargeBinaryArray, ListArray, RecordBatch, UInt32Array, UInt64Array, }; use arrow_schema::{DataType, Field, Schema}; -use bytes::Bytes; -use lance_arrow::ipc::{ - read_ipc_stream_single_at, read_len_prefixed_bytes_at, write_ipc_stream, - write_len_prefixed_bytes, -}; -use lance_core::cache::CacheCodecImpl; +use lance_core::cache::{CacheCodecImpl, CacheEntryReader, CacheEntryWriter}; use lance_core::{Error, Result}; -use serde::{Deserialize, Serialize}; + +use crate::cache_pb::{ + CompressedPostingHeader, PlainPostingHeader, PositionStorage as PbPositionStorage, + PositionStreamCodec as PbPositionStreamCodec, PositionsHeader, PostingListGroupHeader, + PostingTailCodec as PbPostingTailCodec, +}; use super::index::{ CompressedPositionStorage, CompressedPostingList, PlainPostingList, PositionStreamCodec, @@ -43,86 +51,43 @@ use super::index::{ const POSTING_VARIANT_PLAIN: u8 = 0; const POSTING_VARIANT_COMPRESSED: u8 = 1; -const POSITIONS_TAG_NONE: u8 = 0; -const POSITIONS_TAG_LEGACY: u8 = 1; -const POSITIONS_TAG_SHARED: u8 = 2; - -const POSTING_TAIL_CODEC_FIXED32: u8 = 0; -const POSTING_TAIL_CODEC_VARINT_DELTA: u8 = 1; - -const POSITION_STREAM_CODEC_VARINT_DOC_DELTA: u8 = 0; -const POSITION_STREAM_CODEC_PACKED_DELTA: u8 = 1; - // --------------------------------------------------------------------------- -// Codec enum byte mappings +// Codec enum mappings // --------------------------------------------------------------------------- -fn posting_tail_codec_to_u8(c: PostingTailCodec) -> u8 { - match c { - PostingTailCodec::Fixed32 => POSTING_TAIL_CODEC_FIXED32, - PostingTailCodec::VarintDelta => POSTING_TAIL_CODEC_VARINT_DELTA, - } -} +// Posting lists carry their discriminants as protobuf enums in the header; +// these map to/from the in-memory Rust enums. 
-fn u8_to_posting_tail_codec(v: u8) -> Result { - match v { - POSTING_TAIL_CODEC_FIXED32 => Ok(PostingTailCodec::Fixed32), - POSTING_TAIL_CODEC_VARINT_DELTA => Ok(PostingTailCodec::VarintDelta), - _ => Err(Error::io(format!("unknown posting tail codec: {v}"))), +fn posting_tail_codec_to_proto(c: PostingTailCodec) -> PbPostingTailCodec { + match c { + PostingTailCodec::Fixed32 => PbPostingTailCodec::Fixed32, + PostingTailCodec::VarintDelta => PbPostingTailCodec::VarintDelta, } } -fn position_stream_codec_to_u8(c: PositionStreamCodec) -> u8 { +fn proto_to_posting_tail_codec(c: PbPostingTailCodec) -> PostingTailCodec { match c { - PositionStreamCodec::VarintDocDelta => POSITION_STREAM_CODEC_VARINT_DOC_DELTA, - PositionStreamCodec::PackedDelta => POSITION_STREAM_CODEC_PACKED_DELTA, + PbPostingTailCodec::Fixed32 => PostingTailCodec::Fixed32, + PbPostingTailCodec::VarintDelta => PostingTailCodec::VarintDelta, } } -fn u8_to_position_stream_codec(v: u8) -> Result { - match v { - POSITION_STREAM_CODEC_VARINT_DOC_DELTA => Ok(PositionStreamCodec::VarintDocDelta), - POSITION_STREAM_CODEC_PACKED_DELTA => Ok(PositionStreamCodec::PackedDelta), - _ => Err(Error::io(format!("unknown position stream codec: {v}"))), +fn position_stream_codec_to_proto(c: PositionStreamCodec) -> PbPositionStreamCodec { + match c { + PositionStreamCodec::VarintDocDelta => PbPositionStreamCodec::VarintDocDelta, + PositionStreamCodec::PackedDelta => PbPositionStreamCodec::PackedDelta, } } -// --------------------------------------------------------------------------- -// Header / tag I/O helpers (mirrors partition_serde.rs) -// --------------------------------------------------------------------------- - -fn write_json_header(writer: &mut dyn Write, header: &impl Serialize) -> Result<()> { - let bytes = serde_json::to_vec(header)?; - write_len_prefixed_bytes(writer, &bytes)?; - Ok(()) -} - -fn read_json_header(data: &Bytes, offset: &mut usize) -> Result { - let bytes = read_len_prefixed_bytes_at(data, 
offset).map_err(|e| Error::io(e.to_string()))?; - serde_json::from_slice(&bytes) - .map_err(|e| Error::io(format!("failed to deserialize cache header: {e}"))) -} - -fn write_u8(writer: &mut dyn Write, value: u8) -> Result<()> { - writer - .write_all(&[value]) - .map_err(|e| Error::io(format!("failed to write tag byte: {e}"))) -} - -fn read_u8(data: &Bytes, offset: &mut usize) -> Result { - let bytes = data.as_ref(); - if *offset >= bytes.len() { - return Err(Error::io( - "truncated cache entry: missing tag byte".to_string(), - )); +fn proto_to_position_stream_codec(c: PbPositionStreamCodec) -> PositionStreamCodec { + match c { + PbPositionStreamCodec::VarintDocDelta => PositionStreamCodec::VarintDocDelta, + PbPositionStreamCodec::PackedDelta => PositionStreamCodec::PackedDelta, } - let v = bytes[*offset]; - *offset += 1; - Ok(v) } // --------------------------------------------------------------------------- -// Position storage serde (shared by PostingList variants and Positions) +// Position storage sections (shared by PostingList variants and Positions) // --------------------------------------------------------------------------- const POSITION_LIST_COLUMN: &str = "position_list"; @@ -131,33 +96,36 @@ const ROW_IDS_COLUMN: &str = "row_ids"; const FREQUENCIES_COLUMN: &str = "frequencies"; const BLOCKS_COLUMN: &str = "blocks"; -#[derive(Serialize, Deserialize)] -struct SharedPositionsHeader { - codec: u8, +fn legacy_positions_batch(list: &ListArray) -> Result { + let schema = Arc::new(Schema::new(vec![Field::new( + POSITION_LIST_COLUMN, + list.data_type().clone(), + list.is_nullable(), + )])); + Ok(RecordBatch::try_new(schema, vec![Arc::new(list.clone())])?) +} + +fn read_legacy_positions(r: &mut CacheEntryReader<'_>) -> Result { + let batch = r.read_ipc()?; + Ok(batch + .column(0) + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::io("legacy position column is not a ListArray".to_string()))? 
+ .clone()) } -fn write_position_storage( - writer: &mut dyn Write, +/// Write the position sections (the bytes after the header) for `storage`. The +/// caller's header proto carries the storage kind and shared-stream codec. +fn write_position_sections( + w: &mut CacheEntryWriter<'_>, storage: &CompressedPositionStorage, ) -> Result<()> { match storage { CompressedPositionStorage::LegacyPerDoc(list) => { - write_u8(writer, POSITIONS_TAG_LEGACY)?; - let schema = Arc::new(Schema::new(vec![Field::new( - POSITION_LIST_COLUMN, - list.data_type().clone(), - list.is_nullable(), - )])); - let batch = RecordBatch::try_new(schema, vec![Arc::new(list.clone())])?; - write_ipc_stream(&batch, writer)?; + w.write_ipc(&legacy_positions_batch(list)?)?; } CompressedPositionStorage::SharedStream(stream) => { - write_u8(writer, POSITIONS_TAG_SHARED)?; - let header = SharedPositionsHeader { - codec: position_stream_codec_to_u8(stream.codec()), - }; - write_json_header(writer, &header)?; - let offsets = UInt32Array::from(stream.block_offsets().to_vec()); let schema = Arc::new(Schema::new(vec![Field::new( BLOCK_OFFSETS_COLUMN, @@ -165,55 +133,42 @@ fn write_position_storage( false, )])); let batch = RecordBatch::try_new(schema, vec![Arc::new(offsets)])?; - write_ipc_stream(&batch, writer)?; - - write_len_prefixed_bytes(writer, stream.bytes())?; + w.write_ipc(&batch)?; + w.write_raw(stream.bytes())?; } } Ok(()) } -fn read_position_storage( - data: &Bytes, - offset: &mut usize, - tag: u8, -) -> Result { - match tag { - POSITIONS_TAG_LEGACY => { - let batch = - read_ipc_stream_single_at(data, offset).map_err(|e| Error::io(e.to_string()))?; - let list = batch - .column(0) - .as_any() - .downcast_ref::() - .ok_or_else(|| Error::io("legacy position column is not a ListArray".to_string()))? 
- .clone(); - Ok(CompressedPositionStorage::LegacyPerDoc(list)) - } - POSITIONS_TAG_SHARED => { - let header: SharedPositionsHeader = read_json_header(data, offset)?; - let codec = u8_to_position_stream_codec(header.codec)?; - - let batch = - read_ipc_stream_single_at(data, offset).map_err(|e| Error::io(e.to_string()))?; +/// Read the position sections for the given `storage` kind and (for shared +/// streams) `stream_codec`. Returns `None` only when `storage` is +/// [`PbPositionStorage::None`]. +fn read_position_sections( + r: &mut CacheEntryReader<'_>, + storage: PbPositionStorage, + stream_codec: PositionStreamCodec, +) -> Result> { + match storage { + PbPositionStorage::None => Ok(None), + PbPositionStorage::Legacy => Ok(Some(CompressedPositionStorage::LegacyPerDoc( + read_legacy_positions(r)?, + ))), + PbPositionStorage::Shared => { + let batch = r.read_ipc()?; let block_offsets = batch .column(0) .as_primitive_opt::() .ok_or_else(|| Error::io("block_offsets column is not UInt32".to_string()))? .values() .to_vec(); - - // Zero copy: read_len_prefixed_bytes_at returns a Bytes slice - // backed by the same allocation as `data`, and SharedPositionStream - // now stores its byte buffer as Bytes -- no copy on read. - let bytes = - read_len_prefixed_bytes_at(data, offset).map_err(|e| Error::io(e.to_string()))?; - - Ok(CompressedPositionStorage::SharedStream( - SharedPositionStream::new(codec, block_offsets, bytes), - )) + // Zero copy: read_raw returns a Bytes slice backed by the same + // allocation as the input, and SharedPositionStream stores its byte + // buffer as Bytes -- no copy on read. 
+ let bytes = r.read_raw()?; + Ok(Some(CompressedPositionStorage::SharedStream( + SharedPositionStream::new(stream_codec, block_offsets, bytes), + ))) } - other => Err(Error::io(format!("unknown positions tag: {other}"))), } } @@ -221,50 +176,45 @@ fn read_position_storage( // PostingList codec // --------------------------------------------------------------------------- -#[derive(Serialize, Deserialize)] -struct PlainPostingHeader { - max_score: Option, -} - -#[derive(Serialize, Deserialize)] -struct CompressedPostingHeader { - max_score: f32, - length: u32, - posting_tail_codec: u8, -} - impl CacheCodecImpl for PostingList { - fn serialize(&self, writer: &mut dyn Write) -> Result<()> { + const TYPE_ID: &'static str = "lance.fts.PostingList"; + const CURRENT_VERSION: u32 = 1; + + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { match self { Self::Plain(plain) => { - write_u8(writer, POSTING_VARIANT_PLAIN)?; - serialize_plain(writer, plain) + w.write_u8(POSTING_VARIANT_PLAIN)?; + serialize_plain(w, plain) } Self::Compressed(compressed) => { - write_u8(writer, POSTING_VARIANT_COMPRESSED)?; - serialize_compressed(writer, compressed) + w.write_u8(POSTING_VARIANT_COMPRESSED)?; + serialize_compressed(w, compressed) } } } - fn deserialize(data: &Bytes) -> Result { - let mut offset = 0; - let variant = read_u8(data, &mut offset)?; + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let variant = r.read_u8()?; match variant { - POSTING_VARIANT_PLAIN => Ok(Self::Plain(deserialize_plain(data, &mut offset)?)), - POSTING_VARIANT_COMPRESSED => { - Ok(Self::Compressed(deserialize_compressed(data, &mut offset)?)) - } + POSTING_VARIANT_PLAIN => Ok(Self::Plain(deserialize_plain(r)?)), + POSTING_VARIANT_COMPRESSED => Ok(Self::Compressed(deserialize_compressed(r)?)), other => Err(Error::io(format!("unknown PostingList variant: {other}"))), } } } -fn serialize_plain(writer: &mut dyn Write, plain: &PlainPostingList) -> Result<()> { +fn serialize_plain(w: &mut 
CacheEntryWriter<'_>, plain: &PlainPostingList) -> Result<()> { + // Plain postings carry only per-doc legacy positions (or none). + let position_storage = if plain.positions.is_some() { + PbPositionStorage::Legacy + } else { + PbPositionStorage::None + }; let header = PlainPostingHeader { max_score: plain.max_score, + position_storage: position_storage as i32, }; - write_json_header(writer, &header)?; + w.write_header(&header)?; let row_ids = UInt64Array::new(plain.row_ids.clone(), None); let frequencies = Float32Array::new(plain.frequencies.clone(), None); @@ -273,26 +223,18 @@ fn serialize_plain(writer: &mut dyn Write, plain: &PlainPostingList) -> Result<( Field::new(FREQUENCIES_COLUMN, DataType::Float32, false), ])); let batch = RecordBatch::try_new(schema, vec![Arc::new(row_ids), Arc::new(frequencies)])?; - write_ipc_stream(&batch, writer)?; - - match &plain.positions { - Some(list) => { - // Plain postings can only carry per-doc legacy positions; reuse - // the shared encoder. - write_position_storage( - writer, - &CompressedPositionStorage::LegacyPerDoc(list.clone()), - )?; - } - None => write_u8(writer, POSITIONS_TAG_NONE)?, + w.write_ipc(&batch)?; + + if let Some(list) = &plain.positions { + w.write_ipc(&legacy_positions_batch(list)?)?; } Ok(()) } -fn deserialize_plain(data: &Bytes, offset: &mut usize) -> Result { - let header: PlainPostingHeader = read_json_header(data, offset)?; +fn deserialize_plain(r: &mut CacheEntryReader<'_>) -> Result { + let header: PlainPostingHeader = r.read_header()?; - let batch = read_ipc_stream_single_at(data, offset).map_err(|e| Error::io(e.to_string()))?; + let batch = r.read_ipc()?; let row_ids = batch .column(0) .as_primitive_opt::() @@ -306,19 +248,13 @@ fn deserialize_plain(data: &Bytes, offset: &mut usize) -> Result None, - POSITIONS_TAG_LEGACY => match read_position_storage(data, offset, positions_tag)? 
{ - CompressedPositionStorage::LegacyPerDoc(list) => Some(list), - CompressedPositionStorage::SharedStream(_) => { - unreachable!("shared stream tag was read as legacy variant (this is a bug)") - } - }, - other => { - return Err(Error::io(format!( - "Plain posting list cannot have positions tag {other}" - ))); + let positions = match header.position_storage() { + PbPositionStorage::None => None, + PbPositionStorage::Legacy => Some(read_legacy_positions(r)?), + PbPositionStorage::Shared => { + return Err(Error::io( + "Plain posting list cannot have a shared position stream".to_string(), + )); } }; @@ -330,13 +266,33 @@ fn deserialize_plain(data: &Bytes, offset: &mut usize) -> Result Result<()> { +/// The compressed posting list is serialized with a protobuf header followed +/// by 64-byte-aligned Arrow IPC sections (for the `blocks`, and for shared +/// position block-offsets) and a raw blob (for the shared position byte +/// stream, which already has its own portable encoding). +fn serialize_compressed( + w: &mut CacheEntryWriter<'_>, + posting: &CompressedPostingList, +) -> Result<()> { + let (position_storage, position_stream_codec) = match &posting.positions { + None => (PbPositionStorage::None, PbPositionStreamCodec::default()), + Some(CompressedPositionStorage::LegacyPerDoc(_)) => { + (PbPositionStorage::Legacy, PbPositionStreamCodec::default()) + } + Some(CompressedPositionStorage::SharedStream(stream)) => ( + PbPositionStorage::Shared, + position_stream_codec_to_proto(stream.codec()), + ), + }; + let header = CompressedPostingHeader { max_score: posting.max_score, length: posting.length, - posting_tail_codec: posting_tail_codec_to_u8(posting.posting_tail_codec), + posting_tail_codec: posting_tail_codec_to_proto(posting.posting_tail_codec) as i32, + position_storage: position_storage as i32, + position_stream_codec: position_stream_codec as i32, }; - write_json_header(writer, &header)?; + w.write_header(&header)?; let schema = 
Arc::new(Schema::new(vec![Field::new( BLOCKS_COLUMN, @@ -344,20 +300,19 @@ fn serialize_compressed(writer: &mut dyn Write, posting: &CompressedPostingList) false, )])); let batch = RecordBatch::try_new(schema, vec![Arc::new(posting.blocks.clone())])?; - write_ipc_stream(&batch, writer)?; + w.write_ipc(&batch)?; - match &posting.positions { - Some(storage) => write_position_storage(writer, storage)?, - None => write_u8(writer, POSITIONS_TAG_NONE)?, + if let Some(storage) = &posting.positions { + write_position_sections(w, storage)?; } Ok(()) } -fn deserialize_compressed(data: &Bytes, offset: &mut usize) -> Result { - let header: CompressedPostingHeader = read_json_header(data, offset)?; - let posting_tail_codec = u8_to_posting_tail_codec(header.posting_tail_codec)?; +fn deserialize_compressed(r: &mut CacheEntryReader<'_>) -> Result { + let header: CompressedPostingHeader = r.read_header()?; + let posting_tail_codec = proto_to_posting_tail_codec(header.posting_tail_codec()); - let batch = read_ipc_stream_single_at(data, offset).map_err(|e| Error::io(e.to_string()))?; + let batch = r.read_ipc()?; let blocks = batch .column(0) .as_any() @@ -365,12 +320,8 @@ fn deserialize_compressed(data: &Bytes, offset: &mut usize) -> Result Result Result<()> { + const TYPE_ID: &'static str = "lance.fts.PostingListGroup"; + const CURRENT_VERSION: u32 = 1; + + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { let count = u32::try_from(self.posting_lists.len()) .map_err(|_| Error::io("posting list group too large to serialize".to_string()))?; - writer - .write_all(&count.to_le_bytes()) - .map_err(|e| Error::io(format!("failed to write group count: {e}")))?; + w.write_header(&PostingListGroupHeader { count })?; for posting in &self.posting_lists { - let mut buf = Vec::new(); - posting.serialize(&mut buf)?; - write_len_prefixed_bytes(writer, &buf)?; + posting.serialize(w)?; } Ok(()) } - fn deserialize(data: &Bytes) -> Result { - let mut offset = 0; - if data.len() < 4 { - 
return Err(Error::io( - "truncated posting list group: missing count".to_string(), - )); - } - let count = u32::from_le_bytes(data[0..4].try_into().unwrap()) as usize; - offset += 4; - let mut posting_lists = Vec::with_capacity(count); - for _ in 0..count { - let entry = read_len_prefixed_bytes_at(data, &mut offset) - .map_err(|e| Error::io(e.to_string()))?; - posting_lists.push(PostingList::deserialize(&entry)?); + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let header: PostingListGroupHeader = r.read_header()?; + let mut posting_lists = Vec::with_capacity(header.count as usize); + for _ in 0..header.count { + posting_lists.push(PostingList::deserialize(r)?); } Ok(Self::new(posting_lists)) } @@ -428,20 +371,35 @@ impl CacheCodecImpl for PostingListGroup { // --------------------------------------------------------------------------- impl CacheCodecImpl for Positions { - fn serialize(&self, writer: &mut dyn Write) -> Result<()> { - write_position_storage(writer, &self.0) + const TYPE_ID: &'static str = "lance.fts.Positions"; + const CURRENT_VERSION: u32 = 1; + + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { + let (position_storage, position_stream_codec) = match &self.0 { + CompressedPositionStorage::LegacyPerDoc(_) => { + (PbPositionStorage::Legacy, PbPositionStreamCodec::default()) + } + CompressedPositionStorage::SharedStream(stream) => ( + PbPositionStorage::Shared, + position_stream_codec_to_proto(stream.codec()), + ), + }; + let header = PositionsHeader { + position_storage: position_storage as i32, + position_stream_codec: position_stream_codec as i32, + }; + w.write_header(&header)?; + write_position_sections(w, &self.0) } - fn deserialize(data: &Bytes) -> Result { - let mut offset = 0; - let tag = read_u8(data, &mut offset)?; - if tag == POSITIONS_TAG_NONE { - return Err(Error::io( - "Positions cache entry cannot encode the None variant".to_string(), - )); - } - let storage = read_position_storage(data, &mut offset, 
tag)?; - Ok(Self(storage)) + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let header: PositionsHeader = r.read_header()?; + let stream_codec = proto_to_position_stream_codec(header.position_stream_codec()); + read_position_sections(r, header.position_storage(), stream_codec)? + .map(Self) + .ok_or_else(|| { + Error::io("Positions cache entry cannot encode the None variant".to_string()) + }) } } @@ -455,7 +413,8 @@ mod tests { use arrow_array::LargeBinaryArray; use arrow_array::builder::{Int32Builder, ListBuilder}; use bytes::Bytes; - use lance_core::cache::CacheCodecImpl; + use lance_core::Result; + use lance_core::cache::{CacheCodecImpl, CacheEntryReader, CacheEntryWriter}; use super::super::index::{ CompressedPositionStorage, CompressedPostingList, PlainPostingList, PositionStreamCodec, @@ -502,16 +461,26 @@ mod tests { } } - fn roundtrip_posting_list(entry: &PostingList) -> PostingList { + /// Serialize a codec body (no envelope) into a standalone buffer. + fn body_bytes(entry: &T) -> Bytes { let mut buf = Vec::new(); - entry.serialize(&mut buf).unwrap(); - PostingList::deserialize(&Bytes::from(buf)).unwrap() + let mut w = CacheEntryWriter::new(&mut buf); + entry.serialize(&mut w).unwrap(); + Bytes::from(buf) + } + + /// Deserialize a codec body (no envelope) at the current build's version. 
+ fn from_body(data: &Bytes) -> Result { + let mut r = CacheEntryReader::new(data, 0, T::CURRENT_VERSION); + T::deserialize(&mut r) + } + + fn roundtrip_posting_list(entry: &PostingList) -> PostingList { + from_body::(&body_bytes(entry)).unwrap() } fn roundtrip_positions(entry: &Positions) -> Positions { - let mut buf = Vec::new(); - entry.serialize(&mut buf).unwrap(); - Positions::deserialize(&Bytes::from(buf)).unwrap() + from_body::(&body_bytes(entry)).unwrap() } fn assert_slice_points_into_bytes(slice: &[u8], bytes: &Bytes) { @@ -652,13 +621,9 @@ mod tests { expected_stream.clone(), )), ); - let mut buf = Vec::new(); - PostingList::Compressed(posting) - .serialize(&mut buf) - .unwrap(); - let serialized = Bytes::from(buf); + let serialized = body_bytes(&PostingList::Compressed(posting)); - let restored = PostingList::deserialize(&serialized).unwrap(); + let restored = from_body::(&serialized).unwrap(); let PostingList::Compressed(restored) = restored else { panic!("expected Compressed variant"); }; @@ -695,9 +660,7 @@ mod tests { vec![plain.clone(), compressed, plain], ] { let group = PostingListGroup::new(members.clone()); - let mut buf = Vec::new(); - group.serialize(&mut buf).unwrap(); - let restored = PostingListGroup::deserialize(&Bytes::from(buf)).unwrap(); + let restored = from_body::(&body_bytes(&group)).unwrap(); assert_eq!(restored.posting_lists.len(), members.len()); for (a, b) in members.iter().zip(restored.posting_lists.iter()) { match (a, b) { @@ -743,9 +706,241 @@ mod tests { None, ); let entry = PostingList::Plain(plain); - let mut buf = Vec::new(); - entry.serialize(&mut buf).unwrap(); + let mut buf = body_bytes(&entry).to_vec(); buf.truncate(buf.len() / 2); - assert!(PostingList::deserialize(&Bytes::from(buf)).is_err()); + assert!(from_body::(&Bytes::from(buf)).is_err()); + } + + /// Tests covering the stabilized envelope + compressed proto format, + /// exercised through the full type-erased [`CacheCodec`] (envelope + body). 
+ mod stable_format { + use std::sync::Arc; + + use arrow_array::Array; + use lance_core::cache::CacheCodec; + use prost::Message; + + use super::*; + use crate::cache_pb::{CompressedPostingHeader, PostingTailCodec as PbPostingTailCodec}; + + type ArcAny = Arc; + + fn codec() -> CacheCodec { + CacheCodec::from_impl::() + } + + /// Serialize an entry through the full codec (envelope + body). + fn serialize_entry(entry: PostingList) -> Vec { + let any: ArcAny = Arc::new(entry); + let mut buf = Vec::new(); + codec().serialize(&any, &mut buf).unwrap(); + buf + } + + /// A `Bytes` whose base address is 64-byte aligned, modelling a backend + /// that reads cache entries into an aligned buffer. + fn aligned_bytes(payload: &[u8]) -> Bytes { + const ALIGN: usize = 64; + let mut v = vec![0u8; payload.len() + ALIGN]; + let pad = (ALIGN - (v.as_ptr() as usize % ALIGN)) % ALIGN; + v[pad..pad + payload.len()].copy_from_slice(payload); + Bytes::from(v).slice(pad..pad + payload.len()) + } + + fn compressed_with_shared_positions() -> PostingList { + let blocks = + LargeBinaryArray::from_opt_vec(vec![Some(&[9u8; 48][..]), Some(&[1u8; 48])]); + let stream = SharedPositionStream::new( + PositionStreamCodec::PackedDelta, + vec![0u32, 4, 11], + Bytes::from((0u8..64).collect::>()), + ); + PostingList::Compressed(CompressedPostingList::new( + blocks, + 7.0, + 3, + PostingTailCodec::VarintDelta, + Some(CompressedPositionStorage::SharedStream(stream)), + )) + } + + /// The compressed `blocks` (an aligned IPC section) and the shared + /// position blob (a raw section) must both be borrowed zero-copy from + /// the input even though the envelope pushes them to a non-zero, + /// non-aligned starting offset. 
+ #[test] + fn compressed_sections_are_zero_copy_through_envelope() { + let serialized = aligned_bytes(&serialize_entry(compressed_with_shared_positions())); + let restored = codec().deserialize(&serialized).hit().unwrap(); + let restored = restored.downcast::().unwrap(); + let PostingList::Compressed(restored) = restored.as_ref() else { + panic!("expected Compressed"); + }; + + let base = serialized.as_ptr() as usize; + let end = base + serialized.len(); + let points_in = |ptr: usize| ptr >= base && ptr < end; + + // blocks IPC section decoded in place (no realigning memcpy). + for buf in restored.blocks.to_data().buffers() { + assert!( + points_in(buf.as_ptr() as usize), + "blocks buffer was realigned out of the input — misaligned IPC section", + ); + } + // shared position raw blob borrowed in place. + let Some(CompressedPositionStorage::SharedStream(stream)) = &restored.positions else { + panic!("expected shared stream"); + }; + assert!(points_in(stream.bytes().as_ptr() as usize)); + } + + /// Every member of a `PostingListGroup` must also decode zero-copy. The + /// group writes its members inline so each member's IPC sections stay + /// 64-byte aligned within the entry; embedding members in per-member + /// sub-buffers would land them at arbitrary offsets and force a + /// realigning memcpy on load. 
+ #[test] + fn group_member_sections_are_zero_copy_through_envelope() { + let make_member = |fill: u8| { + let blocks = + LargeBinaryArray::from_opt_vec(vec![Some(&[fill; 48][..]), Some(&[fill; 48])]); + PostingList::Compressed(CompressedPostingList::new( + blocks, + 7.0, + 3, + PostingTailCodec::VarintDelta, + None, + )) + }; + let group = PostingListGroup::new(vec![make_member(9), make_member(1)]); + + let group_codec = CacheCodec::from_impl::(); + let any: ArcAny = Arc::new(group); + let mut buf = Vec::new(); + group_codec.serialize(&any, &mut buf).unwrap(); + let serialized = aligned_bytes(&buf); + + let restored = group_codec.deserialize(&serialized).hit().unwrap(); + let restored = restored.downcast::().unwrap(); + + let base = serialized.as_ptr() as usize; + let end = base + serialized.len(); + let points_in = |ptr: usize| ptr >= base && ptr < end; + + assert_eq!(restored.posting_lists.len(), 2); + for member in &restored.posting_lists { + let PostingList::Compressed(member) = member else { + panic!("expected Compressed member"); + }; + for buf in member.blocks.to_data().buffers() { + assert!( + points_in(buf.as_ptr() as usize), + "group member blocks buffer was realigned out of the input — \ + misaligned IPC section", + ); + } + } + } + + /// The plain posting's row-id/frequency IPC section must also decode + /// zero-copy through the envelope + proto header. 
+ #[test] + fn plain_sections_are_zero_copy_through_envelope() { + let plain = PostingList::Plain(PlainPostingList::new( + ScalarBuffer::from((0u64..64).collect::>()), + ScalarBuffer::from(vec![1.0f32; 64]), + Some(2.0), + None, + )); + let serialized = aligned_bytes(&serialize_entry(plain)); + let restored = codec().deserialize(&serialized).hit().unwrap(); + let restored = restored.downcast::().unwrap(); + let PostingList::Plain(restored) = restored.as_ref() else { + panic!("expected Plain"); + }; + + let base = serialized.as_ptr() as usize; + let end = base + serialized.len(); + // The row_ids ScalarBuffer must borrow from the input allocation. + let ptr = restored.row_ids.as_ptr() as usize; + assert!( + ptr >= base && ptr < end, + "row_ids buffer was realigned out of the input — misaligned IPC section", + ); + } + + /// Additive proto fields (lever #1) must not break decoding: an unknown + /// field number appended to the header is ignored. + #[test] + fn header_proto_ignores_unknown_fields() { + let header = CompressedPostingHeader { + max_score: 1.5, + length: 9, + posting_tail_codec: PbPostingTailCodec::VarintDelta as i32, + ..Default::default() + }; + let mut bytes = header.encode_to_vec(); + // Append an unknown field #15, varint wire type (0), value 7. + bytes.push(15 << 3); + bytes.push(7); + let decoded = CompressedPostingHeader::decode(bytes.as_slice()).unwrap(); + assert_eq!(decoded.length, 9); + assert_eq!(decoded.max_score, 1.5); + } + + /// An entry written by a different codec (foreign TYPE_ID) misses. + #[test] + fn foreign_type_id_is_miss() { + // A PostingListGroup entry carries a different TYPE_ID in its + // envelope; reading it as a PostingList must miss, not misread it. 
+ let group = PostingListGroup::new(vec![]); + let any: ArcAny = Arc::new(group); + let mut buf = Vec::new(); + CacheCodec::from_impl::() + .serialize(&any, &mut buf) + .unwrap(); + assert!(codec().deserialize(&Bytes::from(buf)).hit().is_none()); + } + + /// An entry written by a newer build (higher type_version) misses. + #[test] + fn future_type_version_is_miss() { + let mut buf = serialize_entry(compressed_with_shared_positions()); + // Patch the envelope's type_version (magic[4] + ver[1] + len[2] + + // type_id[N]) to a value beyond what this build understands. + let type_id_len = u16::from_le_bytes([buf[5], buf[6]]) as usize; + let version_off = 4 + 1 + 2 + type_id_len; + buf[version_off..version_off + 4].copy_from_slice(&u32::MAX.to_le_bytes()); + assert!(codec().deserialize(&Bytes::from(buf)).hit().is_none()); + } + + /// A pre-stabilization blob (no magic) self-heals to a miss. + #[test] + fn pre_stabilization_blob_is_miss() { + // Old format led with a u64 LE length prefix, never our magic. + let mut blob = (30u64).to_le_bytes().to_vec(); + blob.extend_from_slice(&[0u8; 30]); + assert!(codec().deserialize(&Bytes::from(blob)).hit().is_none()); + } + + /// A structurally-valid envelope whose body leads with an out-of-range + /// variant tag self-heals to a `BodyError` miss rather than panicking or + /// misreading the remaining bytes. + #[test] + fn unknown_posting_variant_is_miss() { + use lance_core::cache::{CacheDecode, CacheMissReason}; + + let mut buf = serialize_entry(compressed_with_shared_positions()); + // The variant tag is the first body byte, right after the envelope + // (magic[4] + ver[1] + type_id_len[2] + type_id[N] + type_version[4]). 
+ let type_id_len = u16::from_le_bytes([buf[5], buf[6]]) as usize; + let variant_off = 4 + 1 + 2 + type_id_len + 4; + buf[variant_off] = 2; // neither PLAIN (0) nor COMPRESSED (1) + match codec().deserialize(&Bytes::from(buf)) { + CacheDecode::Miss(reason) => assert_eq!(reason, CacheMissReason::BodyError), + CacheDecode::Hit(_) => panic!("expected a BodyError miss, got a hit"), + } + } } } diff --git a/rust/lance-index/src/scalar/label_list.rs b/rust/lance-index/src/scalar/label_list.rs index cf357d89585..8e07a607bff 100644 --- a/rust/lance-index/src/scalar/label_list.rs +++ b/rust/lance-index/src/scalar/label_list.rs @@ -18,8 +18,9 @@ use datafusion::execution::RecordBatchStream; use datafusion::physical_plan::{SendableRecordBatchStream, stream::RecordBatchStreamAdapter}; use datafusion_common::ScalarValue; use futures::{StreamExt, TryStream, TryStreamExt, stream::BoxStream}; -use lance_arrow::ipc::{read_len_prefixed_bytes_at, write_len_prefixed_bytes}; -use lance_core::cache::{CacheCodec, CacheCodecImpl, CacheKey, LanceCache}; +use lance_core::cache::{ + CacheCodec, CacheCodecImpl, CacheEntryReader, CacheEntryWriter, CacheKey, LanceCache, +}; use lance_core::deepsize::DeepSizeOf; use lance_core::error::LanceOptionExt; use lance_core::{Error, ROW_ID, Result}; @@ -532,27 +533,30 @@ impl LabelListIndexState { } impl CacheCodecImpl for LabelListIndexState { + const TYPE_ID: &'static str = "lance.scalar.LabelListIndexState"; + const CURRENT_VERSION: u32 = 1; + /// Wire format: /// ```text - /// [u64 list_nulls_len][list_nulls bytes] - /// [bitmap state bytes (self-delimiting)] + /// RAW_BLOB : list_nulls (roaring tree map, portable encoding) + /// /// ``` - fn serialize(&self, writer: &mut dyn std::io::Write) -> Result<()> { + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { let mut nulls_bytes = Vec::with_capacity(self.list_nulls.serialized_size()); self.list_nulls.serialize_into(&mut nulls_bytes)?; - write_len_prefixed_bytes(writer, &nulls_bytes)?; 
- self.bitmap_state.serialize(writer)?; + w.write_raw(&nulls_bytes)?; + // The bitmap state writes its own self-delimiting body inline. + self.bitmap_state.serialize(w)?; Ok(()) } - fn deserialize(data: &bytes::Bytes) -> Result { - let mut offset = 0; - let nulls_bytes = read_len_prefixed_bytes_at(data, &mut offset)?; + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let nulls_bytes = r.read_raw()?; let list_nulls = Arc::new(RowAddrTreeMap::deserialize_from(nulls_bytes.as_ref())?); // The bitmap state is self-delimiting (length-prefixed null map + - // Arrow IPC stream with EOS marker), so we can hand the remaining - // tail to it directly. - let bitmap_state = BitmapIndexState::deserialize(&data.slice(offset..))?; + // Arrow IPC stream with EOS marker); it continues reading the body + // from where the null map left off. + let bitmap_state = BitmapIndexState::deserialize(r)?; Ok(Self { bitmap_state, list_nulls, @@ -728,3 +732,91 @@ impl ScalarIndexPlugin for LabelListIndexPlugin { Ok(()) } } + +#[cfg(test)] +mod tests { + use std::collections::BTreeMap; + + use datafusion_common::ScalarValue; + use lance_core::cache::CacheCodec; + use lance_core::utils::address::RowAddress; + + use super::super::bitmap::BitmapIndexState; + use super::super::btree::OrderableScalarValue; + use super::*; + + fn sample_state() -> LabelListIndexState { + let mut index_map = BTreeMap::new(); + for k in 0..32i32 { + index_map.insert( + OrderableScalarValue(ScalarValue::Int32(Some(k))), + k as usize, + ); + } + let mut bitmap_nulls = RowAddrTreeMap::new(); + bitmap_nulls.insert(RowAddress::new_from_parts(0, 3).into()); + let bitmap_state = + BitmapIndexState::new_for_test(index_map, bitmap_nulls, DataType::Int32).unwrap(); + + let mut list_nulls = RowAddrTreeMap::new(); + list_nulls.insert(RowAddress::new_from_parts(0, 9).into()); + LabelListIndexState { + bitmap_state, + list_nulls: Arc::new(list_nulls), + } + } + + #[test] + fn test_label_list_state_codec_roundtrip() { + let 
state = sample_state(); + let mut buf = Vec::new(); + state + .serialize(&mut CacheEntryWriter::new(&mut buf)) + .unwrap(); + let data = Bytes::from(buf); + let mut reader = CacheEntryReader::new(&data, 0, LabelListIndexState::CURRENT_VERSION); + let restored = LabelListIndexState::deserialize(&mut reader).unwrap(); + + assert_eq!(&*restored.list_nulls, &*state.list_nulls); + assert_eq!( + restored.bitmap_state.lookup_batch(), + state.bitmap_state.lookup_batch() + ); + assert_eq!( + restored.bitmap_state.null_map(), + state.bitmap_state.null_map() + ); + } + + /// The nested bitmap lookup batch must decode zero-copy through the full + /// envelope, proving the leading `list_nulls` RAW_BLOB does not knock the + /// nested IPC section off its 64-byte boundary. + #[test] + fn test_label_list_nested_lookup_is_zero_copy() { + const ALIGN: usize = 64; + let codec = CacheCodec::from_impl::(); + let any: Arc = Arc::new(sample_state()); + let mut buf = Vec::new(); + codec.serialize(&any, &mut buf).unwrap(); + + let mut v = vec![0u8; buf.len() + ALIGN]; + let pad = (ALIGN - (v.as_ptr() as usize % ALIGN)) % ALIGN; + v[pad..pad + buf.len()].copy_from_slice(&buf); + let data = Bytes::from(v).slice(pad..pad + buf.len()); + + let restored = codec.deserialize(&data).hit().unwrap(); + let restored = restored.downcast::().unwrap(); + + let base = data.as_ptr() as usize; + let end = base + data.len(); + for col in restored.bitmap_state.lookup_batch().columns() { + for buffer in col.to_data().buffers() { + let ptr = buffer.as_ptr() as usize; + assert!( + ptr >= base && ptr < end, + "nested bitmap lookup buffer was realigned — misaligned IPC section", + ); + } + } + } +} diff --git a/rust/lance-select/src/mask.rs b/rust/lance-select/src/mask.rs index b76e0de9a2b..f9df7720441 100644 --- a/rust/lance-select/src/mask.rs +++ b/rust/lance-select/src/mask.rs @@ -13,7 +13,7 @@ use itertools::Itertools; use lance_core::deepsize::DeepSizeOf; use roaring::{MultiOps, RoaringBitmap, 
RoaringTreemap}; -use lance_core::cache::CacheCodecImpl; +use lance_core::cache::{CacheCodecImpl, CacheEntryReader, CacheEntryWriter}; use lance_core::utils::address::RowAddress; use lance_core::{Error, Result}; @@ -692,12 +692,17 @@ impl RowAddrTreeMap { } impl CacheCodecImpl for RowAddrTreeMap { - fn serialize(&self, writer: &mut dyn Write) -> Result<()> { - self.serialize_into(writer) + const TYPE_ID: &'static str = "lance.RowAddrTreeMap"; + const CURRENT_VERSION: u32 = 1; + + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { + // A roaring bitmap has its own stable, portable serialization; it is + // the whole body, so write it raw rather than length-prefixed. + self.serialize_into(w.raw_writer()) } - fn deserialize(data: &bytes::Bytes) -> Result { - Self::deserialize_from(data.as_ref()) + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + Self::deserialize_from(r.body().as_ref()) } } diff --git a/rust/lance-table/src/format/index.rs b/rust/lance-table/src/format/index.rs index 33ee464fe76..f603536a3eb 100644 --- a/rust/lance-table/src/format/index.rs +++ b/rust/lance-table/src/format/index.rs @@ -15,6 +15,7 @@ use roaring::RoaringBitmap; use uuid::Uuid; use super::pb; +use lance_core::cache::{CacheEntryReader, CacheEntryWriter}; use lance_core::{Error, Result}; /// Metadata about a single file within an index segment. @@ -235,24 +236,26 @@ impl From<&IndexMetadata> for pb::IndexMetadata { /// orphan rule prevents `impl CacheCodecImpl for Vec`. type ArcAny = Arc; +/// Stable type identifier for the `Vec` cache entry. +const INDEX_METADATA_TYPE_ID: &str = "lance.table.IndexMetadataList"; +/// Body schema version written by this build. 
+const INDEX_METADATA_VERSION: u32 = 1; + fn serialize_index_metadata( any: &ArcAny, - writer: &mut dyn std::io::Write, + writer: &mut CacheEntryWriter<'_>, ) -> lance_core::Result<()> { - use prost::Message; let vec = any .downcast_ref::>() .expect("index_metadata_codec: wrong type (this is a bug in the cache layer)"); let section = pb::IndexSection { indices: vec.iter().map(pb::IndexMetadata::from).collect(), }; - writer.write_all(§ion.encode_to_vec())?; - Ok(()) + writer.write_header(§ion) } -fn deserialize_index_metadata(data: &bytes::Bytes) -> lance_core::Result { - use prost::Message; - let section = pb::IndexSection::decode(data.as_ref())?; +fn deserialize_index_metadata(reader: &mut CacheEntryReader<'_>) -> lance_core::Result { + let section: pb::IndexSection = reader.read_header()?; let indices: Vec = section .indices .into_iter() @@ -262,7 +265,12 @@ fn deserialize_index_metadata(data: &bytes::Bytes) -> lance_core::Result } pub fn index_metadata_codec() -> lance_core::cache::CacheCodec { - lance_core::cache::CacheCodec::new(serialize_index_metadata, deserialize_index_metadata) + lance_core::cache::CacheCodec::new( + INDEX_METADATA_TYPE_ID, + INDEX_METADATA_VERSION, + serialize_index_metadata, + deserialize_index_metadata, + ) } /// List all files in an index directory with their sizes. 
@@ -348,7 +356,8 @@ mod tests { let bytes = store.get(&key).unwrap(); let recovered = codec .deserialize(&bytes::Bytes::copy_from_slice(bytes)) - .unwrap(); + .hit() + .expect("entry should decode as a hit"); let recovered = recovered .downcast::>() .expect("downcast should succeed"); diff --git a/rust/lance/src/dataset/tests/dataset_index.rs b/rust/lance/src/dataset/tests/dataset_index.rs index beb6e2b99fd..d5c4493c8a8 100644 --- a/rust/lance/src/dataset/tests/dataset_index.rs +++ b/rust/lance/src/dataset/tests/dataset_index.rs @@ -2078,11 +2078,7 @@ mod fts_serializing_backend { ) -> Option { let guard = self.serialized.lock().await; if let Some((bytes, stored_codec, _)) = guard.get(key) { - return Some( - stored_codec - .deserialize(&bytes.clone()) - .expect("deserialization should succeed"), - ); + return stored_codec.deserialize(&bytes.clone()).hit(); } drop(guard); self.passthrough.get(key, codec).await diff --git a/rust/lance/src/index/vector/ivf/partition_serde.rs b/rust/lance/src/index/vector/ivf/partition_serde.rs index 83ced18c598..ad737620a94 100644 --- a/rust/lance/src/index/vector/ivf/partition_serde.rs +++ b/rust/lance/src/index/vector/ivf/partition_serde.rs @@ -3,32 +3,17 @@ //! Serialization and zero-copy deserialization for IVF partition cache entries. //! -//! The format is: -//! -//! ```text -//! [header_len: u64 LE] -//! [header: JSON bytes] -//! [sub_index Arrow IPC stream] -//! [... quantizer-specific IPC streams ...] -//! [storage Arrow IPC stream] -//! ``` -//! -//! Each IPC section is a self-delimiting Arrow IPC stream (schema + batches + EOS -//! marker), written directly to the underlying writer without buffering. On -//! deserialization, each message is read into a per-message buffer and zero-copy -//! decoded via [`lance_arrow::ipc`]. +//! Each entry is a protobuf header (see `lance-index/protos-cache/cache.proto`, with the +//! distance and rotation types as proto enums) followed by 64-byte-aligned +//! 
Arrow IPC sections in a fixed, version-keyed order: the sub-index, then any +//! quantizer-specific arrays (PQ codebook, RabitQ Matrix rotation), then the +//! quantizer storage batches. Sections decode zero-copy via [`lance_arrow::ipc`]. -use std::io::Write; use std::sync::Arc; use arrow_array::{FixedSizeListArray, RecordBatch}; use arrow_schema::{DataType, Field, Schema}; -use bytes::Bytes; -use lance_arrow::ipc::{ - read_ipc_stream_at, read_ipc_stream_single_at, read_len_prefixed_bytes_at, write_ipc_stream, - write_ipc_stream_batches, write_len_prefixed_bytes, -}; -use lance_core::cache::CacheCodecImpl; +use lance_core::cache::{CacheCodecImpl, CacheEntryReader, CacheEntryWriter}; use lance_core::{Error, Result}; use lance_index::vector::bq::RQRotationType; use lance_index::vector::bq::builder::RabitQuantizer; @@ -38,11 +23,15 @@ use lance_index::vector::pq::ProductQuantizer; use lance_index::vector::pq::storage::ProductQuantizationMetadata; use lance_index::vector::quantizer::{Quantization, QuantizerStorage}; use lance_index::vector::sq::ScalarQuantizer; -use lance_index::vector::sq::storage::ScalarQuantizationMetadata; use lance_index::vector::storage::VectorStore; use lance_index::vector::v3::subindex::IvfSubIndex; use lance_linalg::distance::DistanceType; -use serde::{Deserialize, Serialize}; + +use lance_index::cache_pb::{ + DistanceType as PbDistanceType, FlatPartitionHeader, PqPartitionHeader, RabitPartitionHeader, + RabitQueryEstimator as PbRabitQueryEstimator, RotationType as PbRotationType, + SqPartitionHeader, +}; use super::v2::PartitionEntry; @@ -68,7 +57,7 @@ type ArcAny = Arc; fn serialize_partition_entry( any: &ArcAny, - writer: &mut dyn Write, + writer: &mut CacheEntryWriter<'_>, ) -> lance_core::Result<()> where S: IvfSubIndex + 'static, @@ -81,14 +70,16 @@ where concrete.serialize(writer) } -fn deserialize_partition_entry(data: &Bytes) -> lance_core::Result +fn deserialize_partition_entry( + reader: &mut CacheEntryReader<'_>, +) -> 
lance_core::Result where S: IvfSubIndex + 'static, Q: Quantization + 'static, Concrete: Quantization + 'static, PartitionEntry: CacheCodecImpl, { - let concrete = PartitionEntry::::deserialize(data)?; + let concrete = PartitionEntry::::deserialize(reader)?; let any: ArcAny = Arc::new(concrete); Ok(any .downcast::>() @@ -109,6 +100,8 @@ where PartitionEntry: CacheCodecImpl, { lance_core::cache::CacheCodec::new( + as CacheCodecImpl>::TYPE_ID, + as CacheCodecImpl>::CURRENT_VERSION, serialize_partition_entry::, deserialize_partition_entry::, ) @@ -118,51 +111,64 @@ where // Common helpers // --------------------------------------------------------------------------- -fn distance_type_to_u8(dt: DistanceType) -> u8 { +// Distance and rotation discriminants travel as proto enums in the header; +// these map to/from the in-memory Rust enums. + +fn distance_type_to_proto(dt: DistanceType) -> PbDistanceType { + match dt { + DistanceType::L2 => PbDistanceType::L2, + DistanceType::Cosine => PbDistanceType::Cosine, + DistanceType::Dot => PbDistanceType::Dot, + DistanceType::Hamming => PbDistanceType::Hamming, + } +} + +fn proto_to_distance_type(dt: PbDistanceType) -> DistanceType { match dt { - DistanceType::L2 => 0, - DistanceType::Cosine => 1, - DistanceType::Dot => 2, - DistanceType::Hamming => 3, + PbDistanceType::L2 => DistanceType::L2, + PbDistanceType::Cosine => DistanceType::Cosine, + PbDistanceType::Dot => DistanceType::Dot, + PbDistanceType::Hamming => DistanceType::Hamming, } } -fn u8_to_distance_type(v: u8) -> Result { - match v { - 0 => Ok(DistanceType::L2), - 1 => Ok(DistanceType::Cosine), - 2 => Ok(DistanceType::Dot), - 3 => Ok(DistanceType::Hamming), - _ => Err(Error::io(format!("unknown distance type: {v}"))), +fn rotation_type_to_proto(rt: RQRotationType) -> PbRotationType { + match rt { + RQRotationType::Matrix => PbRotationType::Matrix, + RQRotationType::Fast => PbRotationType::Fast, } } -fn rotation_type_to_u8(rt: RQRotationType) -> u8 { +fn 
proto_to_rotation_type(rt: PbRotationType) -> RQRotationType { match rt { - RQRotationType::Matrix => 0, - RQRotationType::Fast => 1, + PbRotationType::Matrix => RQRotationType::Matrix, + PbRotationType::Fast => RQRotationType::Fast, } } -fn u8_to_rotation_type(v: u8) -> Result { - match v { - 0 => Ok(RQRotationType::Matrix), - 1 => Ok(RQRotationType::Fast), - _ => Err(Error::io(format!("unknown rotation type: {v}"))), +fn query_estimator_to_proto(qe: RabitQueryEstimator) -> PbRabitQueryEstimator { + match qe { + RabitQueryEstimator::ResidualQuery => PbRabitQueryEstimator::ResidualQuery, + RabitQueryEstimator::RawQuery => PbRabitQueryEstimator::RawQuery, } } -/// Write a JSON-serializable header using [`write_len_prefixed_bytes`]. -fn write_json_header(writer: &mut dyn Write, header: &impl Serialize) -> Result<()> { - let header_json = serde_json::to_vec(header)?; - write_len_prefixed_bytes(writer, &header_json)?; - Ok(()) +fn proto_to_query_estimator(qe: PbRabitQueryEstimator) -> RabitQueryEstimator { + match qe { + PbRabitQueryEstimator::ResidualQuery => RabitQueryEstimator::ResidualQuery, + PbRabitQueryEstimator::RawQuery => RabitQueryEstimator::RawQuery, + } } -/// Read a JSON header written by [`write_json_header`]. -fn read_json_header(data: &Bytes, offset: &mut usize) -> Result { - let bytes = read_len_prefixed_bytes_at(data, offset).map_err(|e| Error::io(e.to_string()))?; - serde_json::from_slice(&bytes).map_err(|e| Error::io(e.to_string())) +/// Read a storage section expected to hold exactly one batch. 
+fn read_single_storage_batch(r: &mut CacheEntryReader<'_>) -> Result { + let mut batches = r.read_ipc_batches()?; + match batches.len() { + 1 => Ok(batches.remove(0)), + n => Err(Error::io(format!( + "expected exactly 1 storage batch, got {n}" + ))), + } } /// Wrap a `FixedSizeListArray` in a single-column `RecordBatch` with the given @@ -202,17 +208,11 @@ fn batch_to_codebook(batch: &RecordBatch) -> Result { // PQ // --------------------------------------------------------------------------- -#[derive(Serialize, Deserialize)] -struct PqPartitionHeader { - distance_type: u8, - nbits: u32, - num_sub_vectors: usize, - dimension: usize, - transposed: bool, -} - impl CacheCodecImpl for PartitionEntry { - fn serialize(&self, writer: &mut dyn Write) -> Result<()> { + const TYPE_ID: &'static str = "lance.vector.ivf.PartitionEntry.PQ"; + const CURRENT_VERSION: u32 = 1; + + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { let metadata = self.storage.metadata(); let distance_type = self.storage.distance_type(); @@ -221,32 +221,28 @@ impl CacheCodecImpl for PartitionEntry { })?; let header = PqPartitionHeader { - distance_type: distance_type_to_u8(distance_type), + distance_type: distance_type_to_proto(distance_type) as i32, nbits: metadata.nbits, - num_sub_vectors: metadata.num_sub_vectors, - dimension: metadata.dimension, + num_sub_vectors: metadata.num_sub_vectors as u64, + dimension: metadata.dimension as u64, transposed: metadata.transposed, }; - write_json_header(writer, &header)?; - write_ipc_stream(&self.index.to_batch()?, writer)?; - write_ipc_stream(&codebook_to_batch(codebook)?, writer)?; - write_ipc_stream_batches(self.storage.to_batches()?, writer)?; + w.write_header(&header)?; + w.write_ipc(&self.index.to_batch()?)?; + w.write_ipc(&codebook_to_batch(codebook)?)?; + w.write_ipc_batches(self.storage.to_batches()?)?; Ok(()) } - fn deserialize(data: &Bytes) -> Result { - let mut offset = 0; - let header: PqPartitionHeader = read_json_header(data, 
&mut offset)?; - let distance_type = u8_to_distance_type(header.distance_type)?; + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let header: PqPartitionHeader = r.read_header()?; + let distance_type = proto_to_distance_type(header.distance_type()); - let sub_index_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; - let codebook_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; - let storage_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; + let sub_index_batch = r.read_ipc()?; + let codebook_batch = r.read_ipc()?; + let storage_batch = read_single_storage_batch(r)?; let index = S::load(sub_index_batch)?; let codebook = batch_to_codebook(&codebook_batch)?; @@ -254,8 +250,8 @@ impl CacheCodecImpl for PartitionEntry { let metadata = ProductQuantizationMetadata { codebook_position: 0, nbits: header.nbits, - num_sub_vectors: header.num_sub_vectors, - dimension: header.dimension, + num_sub_vectors: header.num_sub_vectors as usize, + dimension: header.dimension as usize, codebook: Some(codebook), codebook_tensor: Vec::new(), transposed: header.transposed, @@ -276,41 +272,35 @@ impl CacheCodecImpl for PartitionEntry { // Flat (Float32) // --------------------------------------------------------------------------- -#[derive(Serialize, Deserialize)] -struct FlatPartitionHeader { - distance_type: u8, - dim: usize, -} - impl CacheCodecImpl for PartitionEntry { - fn serialize(&self, writer: &mut dyn Write) -> Result<()> { - let metadata = self.storage.metadata(); - let distance_type = self.storage.distance_type(); + const TYPE_ID: &'static str = "lance.vector.ivf.PartitionEntry.Flat"; + const CURRENT_VERSION: u32 = 1; + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { + let metadata = self.storage.metadata(); let header = FlatPartitionHeader { - distance_type: distance_type_to_u8(distance_type), - dim: 
metadata.dim, + distance_type: distance_type_to_proto(self.storage.distance_type()) as i32, + dim: metadata.dim as u64, }; - write_json_header(writer, &header)?; - write_ipc_stream(&self.index.to_batch()?, writer)?; - write_ipc_stream_batches(self.storage.to_batches()?, writer)?; + w.write_header(&header)?; + w.write_ipc(&self.index.to_batch()?)?; + w.write_ipc_batches(self.storage.to_batches()?)?; Ok(()) } - fn deserialize(data: &Bytes) -> Result { - let mut offset = 0; - let header: FlatPartitionHeader = read_json_header(data, &mut offset)?; - let distance_type = u8_to_distance_type(header.distance_type)?; + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let header: FlatPartitionHeader = r.read_header()?; + let distance_type = proto_to_distance_type(header.distance_type()); - let sub_index_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; - let storage_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; + let sub_index_batch = r.read_ipc()?; + let storage_batch = read_single_storage_batch(r)?; let index = S::load(sub_index_batch)?; - let metadata = FlatMetadata { dim: header.dim }; + let metadata = FlatMetadata { + dim: header.dim as usize, + }; let storage = ::Storage::try_from_batch( storage_batch, &metadata, @@ -327,34 +317,34 @@ impl CacheCodecImpl for PartitionEntry { // --------------------------------------------------------------------------- impl CacheCodecImpl for PartitionEntry { - fn serialize(&self, writer: &mut dyn Write) -> Result<()> { - let metadata = self.storage.metadata(); - let distance_type = self.storage.distance_type(); + const TYPE_ID: &'static str = "lance.vector.ivf.PartitionEntry.FlatBin"; + const CURRENT_VERSION: u32 = 1; + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { + let metadata = self.storage.metadata(); let header = FlatPartitionHeader { - distance_type: distance_type_to_u8(distance_type), - dim: metadata.dim, 
+ distance_type: distance_type_to_proto(self.storage.distance_type()) as i32, + dim: metadata.dim as u64, }; - write_json_header(writer, &header)?; - write_ipc_stream(&self.index.to_batch()?, writer)?; - write_ipc_stream_batches(self.storage.to_batches()?, writer)?; + w.write_header(&header)?; + w.write_ipc(&self.index.to_batch()?)?; + w.write_ipc_batches(self.storage.to_batches()?)?; Ok(()) } - fn deserialize(data: &Bytes) -> Result { - let mut offset = 0; - let header: FlatPartitionHeader = read_json_header(data, &mut offset)?; - let distance_type = u8_to_distance_type(header.distance_type)?; + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let header: FlatPartitionHeader = r.read_header()?; + let distance_type = proto_to_distance_type(header.distance_type()); - let sub_index_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; - let storage_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; + let sub_index_batch = r.read_ipc()?; + let storage_batch = read_single_storage_batch(r)?; let index = S::load(sub_index_batch)?; - let metadata = FlatMetadata { dim: header.dim }; + let metadata = FlatMetadata { + dim: header.dim as usize, + }; let storage = ::Storage::try_from_batch( storage_batch, &metadata, @@ -370,56 +360,41 @@ impl CacheCodecImpl for PartitionEntry { // SQ // --------------------------------------------------------------------------- -#[derive(Serialize, Deserialize)] -struct SqPartitionHeader { - distance_type: u8, - num_bits: u16, - dim: usize, - bounds_start: f64, - bounds_end: f64, -} - impl CacheCodecImpl for PartitionEntry { - fn serialize(&self, writer: &mut dyn Write) -> Result<()> { - let metadata = self.storage.metadata(); - let distance_type = self.storage.distance_type(); + const TYPE_ID: &'static str = "lance.vector.ivf.PartitionEntry.SQ"; + const CURRENT_VERSION: u32 = 1; + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { + 
let metadata = self.storage.metadata(); let header = SqPartitionHeader { - distance_type: distance_type_to_u8(distance_type), - num_bits: metadata.num_bits, - dim: metadata.dim, + distance_type: distance_type_to_proto(self.storage.distance_type()) as i32, + num_bits: metadata.num_bits as u32, + dim: metadata.dim as u64, bounds_start: metadata.bounds.start, bounds_end: metadata.bounds.end, }; - write_json_header(writer, &header)?; - write_ipc_stream(&self.index.to_batch()?, writer)?; - // SQ storage may contain multiple batches; stream them all in one IPC stream. - write_ipc_stream_batches(self.storage.to_batches()?, writer)?; + w.write_header(&header)?; + w.write_ipc(&self.index.to_batch()?)?; + // SQ storage may contain multiple batches; write them all in one section. + w.write_ipc_batches(self.storage.to_batches()?)?; Ok(()) } - fn deserialize(data: &Bytes) -> Result { - let mut offset = 0; - let header: SqPartitionHeader = read_json_header(data, &mut offset)?; - let distance_type = u8_to_distance_type(header.distance_type)?; + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let header: SqPartitionHeader = r.read_header()?; + let distance_type = proto_to_distance_type(header.distance_type()); - let sub_index_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; - let storage_batches = - read_ipc_stream_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; + let sub_index_batch = r.read_ipc()?; + let storage_batches = r.read_ipc_batches()?; let index = S::load(sub_index_batch)?; - let metadata = ScalarQuantizationMetadata { - dim: header.dim, - num_bits: header.num_bits, - bounds: header.bounds_start..header.bounds_end, - }; + let num_bits = header.num_bits as u16; let storage = ::Storage::try_new( - metadata.num_bits, + num_bits, distance_type, - metadata.bounds, + header.bounds_start..header.bounds_end, storage_batches, None, )?; @@ -432,88 +407,69 @@ impl CacheCodecImpl for PartitionEntry { // RabitQ 
// --------------------------------------------------------------------------- -#[derive(Serialize, Deserialize)] -struct RabitPartitionHeader { - distance_type: u8, - num_bits: u8, - code_dim: u32, - #[serde(default = "default_rabit_query_estimator")] - query_estimator: RabitQueryEstimator, - /// 0 = Matrix, 1 = Fast - rotation_type: u8, - /// Fast rotation signs (only set when rotation_type == Fast). - fast_rotation_signs: Option>, -} - -fn default_rabit_query_estimator() -> RabitQueryEstimator { - RabitQueryEstimator::ResidualQuery -} - impl CacheCodecImpl for PartitionEntry { - fn serialize(&self, writer: &mut dyn Write) -> Result<()> { - let metadata = self.storage.metadata(); - let distance_type = self.storage.distance_type(); + const TYPE_ID: &'static str = "lance.vector.ivf.PartitionEntry.Rabit"; + const CURRENT_VERSION: u32 = 1; + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { + let metadata = self.storage.metadata(); let header = RabitPartitionHeader { - distance_type: distance_type_to_u8(distance_type), - num_bits: metadata.num_bits, + distance_type: distance_type_to_proto(self.storage.distance_type()) as i32, + num_bits: metadata.num_bits as u32, code_dim: metadata.code_dim, - query_estimator: metadata.query_estimator, - rotation_type: rotation_type_to_u8(metadata.rotation_type), + rotation_type: rotation_type_to_proto(metadata.rotation_type) as i32, + query_estimator: query_estimator_to_proto(metadata.query_estimator) as i32, fast_rotation_signs: metadata.fast_rotation_signs.clone(), }; - write_json_header(writer, &header)?; + w.write_header(&header)?; + w.write_ipc(&self.index.to_batch()?)?; - write_ipc_stream(&self.index.to_batch()?, writer)?; - - // Write the rotation matrix IPC stream only for Matrix rotation; the - // Fast rotation case stores its signs compactly in the JSON header. + // Write the rotation matrix IPC section only for Matrix rotation; the + // Fast rotation case stores its signs compactly in the proto header. 
if metadata.rotation_type == RQRotationType::Matrix { let mat = metadata.rotate_mat.as_ref().ok_or_else(|| { Error::io( "RabitQ Matrix metadata missing rotate_mat during serialization".to_string(), ) })?; - write_ipc_stream(&fsl_to_batch(mat, "rotate_mat")?, writer)?; + w.write_ipc(&fsl_to_batch(mat, "rotate_mat")?)?; } - write_ipc_stream_batches(self.storage.to_batches()?, writer)?; + w.write_ipc_batches(self.storage.to_batches()?)?; Ok(()) } - fn deserialize(data: &Bytes) -> Result { - let mut offset = 0; - let header: RabitPartitionHeader = read_json_header(data, &mut offset)?; - let distance_type = u8_to_distance_type(header.distance_type)?; - let rotation_type = u8_to_rotation_type(header.rotation_type)?; + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + let header: RabitPartitionHeader = r.read_header()?; + let distance_type = proto_to_distance_type(header.distance_type()); + let rotation_type = proto_to_rotation_type(header.rotation_type()); - let sub_index_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; + let sub_index_batch = r.read_ipc()?; let rotate_mat = if rotation_type == RQRotationType::Matrix { - let mat_batch = read_ipc_stream_single_at(data, &mut offset) - .map_err(|e| Error::io(e.to_string()))?; + let mat_batch = r.read_ipc()?; Some(batch_to_fsl(&mat_batch)?) } else { None }; - let storage_batch = - read_ipc_stream_single_at(data, &mut offset).map_err(|e| Error::io(e.to_string()))?; + let storage_batch = read_single_storage_batch(r)?; let index = S::load(sub_index_batch)?; + // Read the proto enum accessor before moving fields out of `header`. 
+ let query_estimator = proto_to_query_estimator(header.query_estimator()); let metadata = RabitQuantizationMetadata { rotate_mat, rotate_mat_position: None, fast_rotation_signs: header.fast_rotation_signs, rotation_type, code_dim: header.code_dim, - num_bits: header.num_bits, + num_bits: header.num_bits as u8, // The storage batch already has packed codes; skip re-packing. packed: true, - query_estimator: header.query_estimator, + query_estimator, }; let storage = ::Storage::try_from_batch( storage_batch, @@ -551,6 +507,21 @@ mod tests { use lance_index::vector::flat::storage::FlatFloatStorage; use lance_index::vector::sq::storage::ScalarQuantizationStorage; + /// Serialize a codec body (no envelope) for tests. + fn ser_body(entry: &T) -> Vec { + let mut buf = Vec::new(); + entry + .serialize(&mut CacheEntryWriter::new(&mut buf)) + .unwrap(); + buf + } + + /// Deserialize a codec body (no envelope) at the current build's version. + fn de_body(bytes: Vec) -> Result { + let data = bytes::Bytes::from(bytes); + T::deserialize(&mut CacheEntryReader::new(&data, 0, T::CURRENT_VERSION)) + } + // ----- PQ helpers ------------------------------------------------------- fn make_test_codebook(dim: usize, num_sub_vectors: usize) -> FixedSizeListArray { @@ -618,12 +589,9 @@ mod tests { storage, }; - let mut serialized = Vec::new(); - entry.serialize(&mut serialized).unwrap(); - let deserialized = PartitionEntry::::deserialize( - &bytes::Bytes::from(serialized), - ) - .unwrap(); + let serialized = ser_body(&entry); + let deserialized = + de_body::>(serialized).unwrap(); assert_eq!(entry.storage, deserialized.storage); } @@ -671,12 +639,8 @@ mod tests { storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = PartitionEntry::::deserialize( - &bytes::Bytes::from(bytes), - ) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); assert_eq!( restored.storage.distance_type(), entry.storage.distance_type() @@ 
-694,12 +658,9 @@ mod tests { storage, }; - let mut serialized = Vec::new(); - entry.serialize(&mut serialized).unwrap(); - let deserialized = PartitionEntry::::deserialize( - &bytes::Bytes::from(serialized), - ) - .unwrap(); + let serialized = ser_body(&entry); + let deserialized = + de_body::>(serialized).unwrap(); assert_eq!(entry.storage, deserialized.storage); } @@ -712,13 +673,9 @@ mod tests { index: FlatIndex::default(), storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); + let mut bytes = ser_body(&entry); bytes.truncate(3); - assert!( - PartitionEntry::::deserialize(&bytes::Bytes::from(bytes)) - .is_err() - ); + assert!(de_body::>(bytes).is_err()); } // ----- Flat helpers ----------------------------------------------------- @@ -756,11 +713,8 @@ mod tests { storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = - PartitionEntry::::deserialize(&bytes::Bytes::from(bytes)) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); assert_eq!( restored.storage.metadata().dim, @@ -786,11 +740,8 @@ mod tests { index: FlatIndex::default(), storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = - PartitionEntry::::deserialize(&bytes::Bytes::from(bytes)) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); assert_eq!(restored.storage.distance_type(), dt); } } @@ -803,11 +754,8 @@ mod tests { storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = - PartitionEntry::::deserialize(&bytes::Bytes::from(bytes)) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); let restored_batch = restored.storage.to_batches().unwrap().next().unwrap(); let schema = restored_batch.schema(); @@ -828,11 +776,8 @@ mod tests { storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = - 
PartitionEntry::::deserialize(&bytes::Bytes::from(bytes)) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); let restored_batch = restored.storage.to_batches().unwrap().next().unwrap(); let schema = restored_batch.schema(); @@ -884,11 +829,8 @@ mod tests { storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = - PartitionEntry::::deserialize(&bytes::Bytes::from(bytes)) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); let m = entry.storage.metadata(); let rm = restored.storage.metadata(); @@ -914,12 +856,8 @@ mod tests { index: FlatIndex::default(), storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = PartitionEntry::::deserialize( - &bytes::Bytes::from(bytes), - ) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); assert_eq!(restored.storage.distance_type(), dt); } } @@ -960,11 +898,8 @@ mod tests { index: FlatIndex::default(), storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = - PartitionEntry::::deserialize(&bytes::Bytes::from(bytes)) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); assert_eq!(restored.storage.len(), 30); let orig_ids: Vec = entry.storage.row_ids().copied().collect(); @@ -978,14 +913,27 @@ mod tests { num_rows: usize, code_dim: usize, distance_type: DistanceType, + ) -> ::Storage { + make_rabit_storage( + num_rows, + code_dim, + distance_type, + RQRotationType::Fast, + RabitQueryEstimator::ResidualQuery, + ) + } + + fn make_rabit_storage( + num_rows: usize, + code_dim: usize, + distance_type: DistanceType, + rotation_type: RQRotationType, + query_estimator: RabitQueryEstimator, ) -> ::Storage { use lance_arrow::FixedSizeListArrayExt; - let quantizer = RabitQuantizer::new_with_rotation::( - 1, - code_dim as i32, - RQRotationType::Fast, - ); + let 
quantizer = + RabitQuantizer::new_with_rotation::(1, code_dim as i32, rotation_type); let values: Vec = (0..num_rows * code_dim) .map(|i| (i % 100) as f32 / 100.0 - 0.5) .collect(); @@ -997,7 +945,8 @@ mod tests { .as_fixed_size_list() .clone(); - let metadata = quantizer.metadata(None); + let mut metadata = quantizer.metadata(None); + metadata.query_estimator = query_estimator; let batch = RecordBatch::try_from_iter(vec![ ( lance_core::ROW_ID, @@ -1044,11 +993,8 @@ mod tests { storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = - PartitionEntry::::deserialize(&bytes::Bytes::from(bytes)) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); let m = entry.storage.metadata(); let rm = restored.storage.metadata(); @@ -1082,22 +1028,125 @@ mod tests { fn test_rabitq_distance_types() { for dt in [DistanceType::L2, DistanceType::Cosine, DistanceType::Dot] { let storage = make_rabit_storage_fast(10, 32, dt); - let expected_distance_type = if dt == DistanceType::Cosine { - DistanceType::L2 - } else { - dt - }; let entry = PartitionEntry:: { index: FlatIndex::default(), storage, }; - let mut bytes = Vec::new(); - entry.serialize(&mut bytes).unwrap(); - let restored = PartitionEntry::::deserialize( - &bytes::Bytes::from(bytes), - ) + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); + // The codec round-trips the distance type faithfully. + assert_eq!( + restored.storage.distance_type(), + entry.storage.distance_type() + ); + } + } + + #[test] + fn test_roundtrip_rabitq_raw_query_estimator() { + // The query estimator is a non-default value here; it must survive the + // round trip so raw-query search keeps working after a cache reload. 
+ let storage = make_rabit_storage( + 40, + 32, + DistanceType::L2, + RQRotationType::Fast, + RabitQueryEstimator::RawQuery, + ); + assert_eq!( + storage.metadata().query_estimator, + RabitQueryEstimator::RawQuery + ); + let entry = PartitionEntry:: { + index: FlatIndex::default(), + storage, + }; + + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); + assert_eq!( + restored.storage.metadata().query_estimator, + RabitQueryEstimator::RawQuery + ); + } + + /// Matrix rotation writes an extra `rotate_mat` IPC section between the + /// sub-index and storage sections; exercise that the codec preserves it. + #[test] + fn test_roundtrip_flat_rabitq_matrix() { + let storage = make_rabit_storage( + 40, + 32, + DistanceType::L2, + RQRotationType::Matrix, + RabitQueryEstimator::ResidualQuery, + ); + let entry = PartitionEntry:: { + index: FlatIndex::default(), + storage, + }; + + let bytes = ser_body(&entry); + let restored = de_body::>(bytes).unwrap(); + + let m = entry.storage.metadata(); + let rm = restored.storage.metadata(); + assert_eq!(rm.rotation_type, RQRotationType::Matrix); + assert_eq!(rm.code_dim, m.code_dim); + assert_eq!(rm.num_bits, m.num_bits); + // The rotation matrix itself must survive the round trip. + let orig_mat = m + .rotate_mat + .as_ref() + .expect("matrix rotation has rotate_mat"); + let rest_mat = rm + .rotate_mat + .as_ref() + .expect("restored matrix rotation has rotate_mat"); + assert_eq!( + orig_mat.values().as_primitive::().values(), + rest_mat.values().as_primitive::().values(), + ); + } + + /// SQ storage (a multi-batch IPC section) must decode zero-copy through the + /// full envelope even though the proto header and sub-index section push it + /// to a non-aligned starting offset. 
+ #[test] + fn test_partition_storage_is_zero_copy_through_envelope() { + use lance_core::cache::CacheCodec; + const ALIGN: usize = 64; + + let entry = PartitionEntry:: { + index: FlatIndex::default(), + storage: make_sq_storage(64, 32, DistanceType::L2), + }; + let codec = CacheCodec::from_impl::>(); + let any: Arc = Arc::new(entry); + let mut buf = Vec::new(); + codec.serialize(&any, &mut buf).unwrap(); + + let mut v = vec![0u8; buf.len() + ALIGN]; + let pad = (ALIGN - (v.as_ptr() as usize % ALIGN)) % ALIGN; + v[pad..pad + buf.len()].copy_from_slice(&buf); + let data = bytes::Bytes::from(v).slice(pad..pad + buf.len()); + + let restored = codec.deserialize(&data).hit().unwrap(); + let restored = restored + .downcast::>() .unwrap(); - assert_eq!(restored.storage.distance_type(), expected_distance_type); + + let base = data.as_ptr() as usize; + let end = base + data.len(); + let first = restored.storage.to_batches().unwrap().next().unwrap(); + for col in first.columns() { + for buffer in col.to_data().buffers() { + let ptr = buffer.as_ptr() as usize; + assert!( + ptr >= base && ptr < end, + "storage buffer was realigned out of the input — misaligned IPC section", + ); + } } } @@ -1135,17 +1184,12 @@ mod tests { let entry = IvfStateEntryBox(Arc::new(state)); - let mut bytes = Vec::new(); - CacheCodecImpl::serialize(&entry, &mut bytes).unwrap(); - - let restored = - ::deserialize(&bytes::Bytes::from(bytes.clone())) - .unwrap(); + let bytes = ser_body(&entry); + let restored = de_body::(bytes.clone()).unwrap(); // Re-serialize the restored entry and compare bytes — a stronger check // than field-by-field comparison and avoids needing to downcast. 
- let mut restored_bytes = Vec::new(); - CacheCodecImpl::serialize(&restored, &mut restored_bytes).unwrap(); + let restored_bytes = ser_body(&restored); assert_eq!(bytes, restored_bytes); } } diff --git a/rust/lance/src/index/vector/ivf/v2.rs b/rust/lance/src/index/vector/ivf/v2.rs index 2c91f6311ab..5b29752f7c1 100644 --- a/rust/lance/src/index/vector/ivf/v2.rs +++ b/rust/lance/src/index/vector/ivf/v2.rs @@ -3,7 +3,6 @@ //! IVF - Inverted File index. -use std::io::Write as IoWrite; use std::marker::PhantomData; use std::{ any::Any, @@ -26,8 +25,10 @@ use futures::future::BoxFuture; use futures::prelude::stream::{self, TryStreamExt}; use futures::{StreamExt, TryFutureExt}; use lance_arrow::RecordBatchExt; -use lance_arrow::ipc::write_len_prefixed_bytes; -use lance_core::cache::{CacheCodec, CacheCodecImpl, CacheKey, LanceCache, WeakLanceCache}; +use lance_core::cache::{ + CacheCodec, CacheCodecImpl, CacheEntryReader, CacheEntryWriter, CacheKey, LanceCache, + WeakLanceCache, +}; use lance_core::deepsize::DeepSizeOf; use lance_core::utils::tokio::{get_num_compute_intensive_cpus, spawn_cpu}; use lance_core::utils::tracing::{IO_TYPE_LOAD_VECTOR_PART, TRACE_IO_EVENTS}; @@ -35,6 +36,7 @@ use lance_core::{Error, ROW_ID, Result}; use lance_encoding::decoder::{DecoderPlugins, FilterExpression}; use lance_file::LanceEncodingsIo; use lance_file::reader::{CachedFileMetadata, FileReader, FileReaderOptions}; +use lance_index::cache_pb::IvfStateHeader; use lance_index::frag_reuse::FragReuseIndex; use lance_index::metrics::{LocalMetricsCollector, MetricsCollector, NoOpMetricsCollector}; use lance_index::vector::VectorIndexCacheEntry; @@ -217,28 +219,6 @@ impl DeepSizeOf for IvfIndexState { } } -/// Serialization header for the `IvfIndexState` wire format. -/// -/// Kept as a flat, non-generic struct so the JSON header format is stable -/// regardless of `Q`. `quantizer_metadata_json` holds the serialized -/// `Q::Metadata`; large blobs (PQ codebook, RQ matrix) follow as raw bytes. 
-#[derive(serde::Serialize, serde::Deserialize)] -struct IvfIndexStateHeader { - index_file_path: String, - uuid: String, - distance_type: String, - sub_index_metadata: Vec, - sub_index_type: String, - quantization_type: String, - quantizer_metadata_json: String, - #[serde(default)] - cache_key_prefix: String, - #[serde(default)] - index_file_size: u64, - #[serde(default)] - aux_file_size: u64, -} - /// Object-safe interface for a type-erased `IvfIndexState`. /// /// Stored as `Arc` inside [`IvfStateEntryBox`], which is @@ -246,7 +226,7 @@ struct IvfIndexStateHeader { /// wrapper lets the cache infrastructure work with a sized type while the /// hot paths call `reconstruct` without knowing `Q`. pub(crate) trait IvfStateEntry: DeepSizeOf + Send + Sync + 'static { - fn serialize_state(&self, writer: &mut dyn IoWrite) -> Result<()>; + fn serialize_state(&self, w: &mut CacheEntryWriter<'_>) -> Result<()>; fn reconstruct<'a>( &'a self, @@ -271,42 +251,39 @@ impl DeepSizeOf for IvfStateEntryBox { } } -/// Wire format (unchanged from the non-generic `IvfIndexState`): -/// `[header_json_len: u64 LE][header JSON][ivf_pb_len: u64 LE][ivf protobuf] -/// [extra_len: u64 LE][extra bytes][aux_ivf_pb_len: u64 LE][aux_ivf protobuf]` +/// Wire format: +/// ```text +/// HEADER : IvfStateHeader proto (paths, types, quantizer metadata JSON) +/// RAW_BLOB : IVF model protobuf +/// RAW_BLOB : quantizer extra-metadata buffer (may be empty) +/// RAW_BLOB : auxiliary IVF model protobuf +/// ``` impl CacheCodecImpl for IvfStateEntryBox { - fn serialize(&self, writer: &mut dyn IoWrite) -> Result<()> { - self.0.serialize_state(writer) - } + const TYPE_ID: &'static str = "lance.vector.ivf.IvfState"; + const CURRENT_VERSION: u32 = 1; - fn deserialize(data: &bytes::Bytes) -> Result { - use lance_arrow::ipc::read_len_prefixed_bytes_at; + fn serialize(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { + self.0.serialize_state(w) + } - // Parse the common wire format, then dispatch on 
quantization_type to + fn deserialize(r: &mut CacheEntryReader<'_>) -> Result { + // Parse the common header, then dispatch on quantization_type to // construct the right IvfIndexState. - let mut offset = 0; - let header_bytes = read_len_prefixed_bytes_at(data, &mut offset)?; - let header: IvfIndexStateHeader = serde_json::from_slice(&header_bytes) - .map_err(|e| lance_core::Error::io(format!("IvfIndexState header: {e}")))?; + let header: IvfStateHeader = r.read_header()?; - let ivf_bytes = read_len_prefixed_bytes_at(data, &mut offset)?; + let ivf_bytes = r.read_raw()?; let ivf = IvfModel::try_from( pb::Ivf::decode(ivf_bytes.as_ref()) .map_err(|e| lance_core::Error::io(format!("IvfIndexState IVF decode: {e}")))?, )?; - let extra_bytes = read_len_prefixed_bytes_at(data, &mut offset)?; + let extra_bytes = r.read_raw()?; - // aux_ivf was added after initial deployment; fall back to ivf on - // clean EOF (legacy format without the field). - let aux_ivf = if offset + 8 <= data.len() { - let aux_ivf_bytes = read_len_prefixed_bytes_at(data, &mut offset)?; + let aux_ivf_bytes = r.read_raw()?; + let aux_ivf = IvfModel::try_from(pb::Ivf::decode(aux_ivf_bytes.as_ref()).map_err(|e| { lance_core::Error::io(format!("IvfIndexState aux IVF decode: {e}")) - })?)? - } else { - ivf.clone() - }; + })?)?; let distance_type = DistanceType::try_from(header.distance_type.as_str())?; let sub_index_type = SubIndexType::try_from(header.sub_index_type.as_str())?; @@ -315,7 +292,7 @@ impl CacheCodecImpl for IvfStateEntryBox { // Helper: parse Q::Metadata from the JSON+extra_bytes in the header, // then build an IvfStateEntryBox wrapping IvfIndexState. 
fn make_entry( - header: IvfIndexStateHeader, + header: IvfStateHeader, ivf: IvfModel, aux_ivf: IvfModel, extra_bytes: bytes::Bytes, @@ -401,13 +378,13 @@ impl CacheCodecImpl for IvfStateEntryBox { } impl IvfStateEntry for IvfIndexState { - fn serialize_state(&self, writer: &mut dyn IoWrite) -> Result<()> { + fn serialize_state(&self, w: &mut CacheEntryWriter<'_>) -> Result<()> { let quantizer_metadata_json = serde_json::to_string(&self.metadata) .map_err(|e| lance_core::Error::io(format!("IvfIndexState metadata: {e}")))?; let extra = self.metadata.extra_metadata()?; let extra = extra.as_deref().unwrap_or(&[]); - let header = IvfIndexStateHeader { + let header = IvfStateHeader { index_file_path: self.index_file_path.clone(), uuid: self.uuid.to_string(), distance_type: self.distance_type.to_string(), @@ -419,15 +396,13 @@ impl IvfStateEntry for IvfIndexState { index_file_size: self.index_file_size, aux_file_size: self.aux_file_size, }; - let header_json = serde_json::to_vec(&header) - .map_err(|e| lance_core::Error::io(format!("IvfIndexState header: {e}")))?; let ivf_bytes = pb::Ivf::try_from(&self.ivf)?.encode_to_vec(); let aux_ivf_bytes = pb::Ivf::try_from(&self.aux_ivf)?.encode_to_vec(); - write_len_prefixed_bytes(writer, &header_json)?; - write_len_prefixed_bytes(writer, &ivf_bytes)?; - write_len_prefixed_bytes(writer, extra)?; - write_len_prefixed_bytes(writer, &aux_ivf_bytes)?; + w.write_header(&header)?; + w.write_raw(&ivf_bytes)?; + w.write_raw(extra)?; + w.write_raw(&aux_ivf_bytes)?; Ok(()) } @@ -6240,11 +6215,9 @@ mod tests { // Try serialized store first let guard = self.serialized.lock().await; if let Some((bytes, stored_codec, _)) = guard.get(key) { - return Some( - stored_codec - .deserialize(&bytes::Bytes::copy_from_slice(bytes)) - .expect("deserialization should succeed"), - ); + return stored_codec + .deserialize(&bytes::Bytes::copy_from_slice(bytes)) + .hit(); } drop(guard); // Fall through to passthrough From 
9813867b32f9ec0c75508c38ff4cbd80e875b620 Mon Sep 17 00:00:00 2001 From: LuQQiu Date: Wed, 17 Jun 2026 10:15:38 -0700 Subject: [PATCH 124/177] perf(fts): push prefilter through scalar index on flat FTS (#7283) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - Flat FTS (no inverted index on the text column) used `scan_fragments` + a manual `LanceFilterExec` to apply the prefilter, bypassing any scalar index on the filter column. - Route the scan through `filtered_read` instead — same pattern as the brute-force KNN path at `scanner.rs:3839-3848` — so the pushable part of the prefilter is evaluated inside `FilteredReadExec` (using a scalar index when one exists) and only the unpushable `refine_expr` is reapplied on top. - Requires `prefilter(true)` on the scanner. The postfilter branch passes an empty `ExprFilterPlan::default()` down the FTS path and is unaffected. ## Plan shape Before (with `WHERE id = 1` and a BTree on `id`, no FTS index on `text`): ``` FlatMatchQueryExec └── LanceFilterExec(id = 1) # post-scan filter, BTree unused └── LanceScan(columns=[text, id], with_row_id) ``` After: ``` FlatMatchQueryExec └── FilteredReadExec # full_filter=id = Int32(1), BTree used columns=[text], with_row_id # refine_expr (if any) reapplied as LanceFilterExec on top ``` ## Test plan - [x] New `test_fts_without_index_uses_scalar_index_for_prefilter` in `dataset_index.rs`: BTree on `id`, no FTS index, flat FTS + `id = 1` prefilter with `.prefilter(true)`. Asserts via `analyze_plan` that `LanceRead` shows `full_filter=id = Int32(1)`, no `LanceScan:` in the plan, and the result set is correct (2 rows). - [x] Full FTS test suite passes (123 tests). 
- [x] `cargo fmt --all` - [x] `cargo clippy --all --tests --benches -- -D warnings` --------- Co-authored-by: Claude Opus 4.7 --- python/python/tests/test_scalar_index.py | 5 +- rust/lance/src/dataset/scanner.rs | 78 +++++++++++++------ rust/lance/src/dataset/tests/dataset_index.rs | 72 +++++++++++++++++ 3 files changed, 129 insertions(+), 26 deletions(-) diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index 7ddfbbc0dc8..13b3de74838 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -648,7 +648,10 @@ def make_fts_search(ds): assert "ScalarIndexQuery" in plan assert "MaterializeIndex" not in plan assert "FlatMatchQuery" in plan - assert "LanceScan" in plan + # Flat FTS now reads via FilteredReadExec (prints as `LanceRead`) so the + # BTree on `id` pushes into the unindexed-fragment scan too. + assert "LanceRead" in plan + assert "LanceScan" not in plan assert make_fts_search(ds).to_table().num_rows == 12 # Update vector index but NOT scalar index diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index 2d75104d26e..09cd7023e74 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -3591,33 +3591,35 @@ impl Scanner { .clone(); let mut columns = vec![column]; - if let Some(expr) = filter_plan.full_expr.as_ref() { - let filter_columns = Planner::column_names_in_expr(expr); - columns.extend(filter_columns); + if let Some(refine_expr) = filter_plan.refine_expr.as_ref() { + columns.extend(Planner::column_names_in_expr(refine_expr)); } - let flat_fts_scan_schema = Arc::new(self.dataset.schema().project(&columns).unwrap()); - let mut scan_node = self.scan_fragments( - true, - false, - false, - false, - false, - flat_fts_scan_schema, - Arc::new(fragments), - None, - false, - ); + let scan_projection = self + .dataset + .empty_projection() + .with_row_id() + .union_columns(&columns, OnMissing::Error)?; - if 
let Some(expr) = filter_plan.full_expr.as_ref() { - // If there is a prefilter we need to manually apply it to the new data - scan_node = Arc::new(LanceFilterExec::try_new(expr.clone(), scan_node)?); + let PlannedFilteredScan { mut plan, .. } = self + .filtered_read( + filter_plan, + scan_projection, + /*make_deletions_null=*/ false, + Some(Arc::new(fragments)), + None, + /*is_prefilter=*/ true, + ) + .await?; + + if let Some(refine_expr) = filter_plan.refine_expr.as_ref() { + plan = Arc::new(LanceFilterExec::try_new(refine_expr.clone(), plan)?); } let flat_match_plan = Arc::new(FlatMatchQueryExec::new( self.dataset.clone(), query.clone(), params.clone(), - scan_node, + plan, )); Ok(flat_match_plan) } @@ -10470,7 +10472,12 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") .await?; log::info!("Test case: Full text search with unindexed rows"); - let expected = r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid] + // The flat-FTS path now reads through `FilteredReadExec`, matching the + // brute-force KNN path. With no prefilter the scan still produces no + // pushdown, but the operator differs by storage version: legacy emits + // a `LanceScan`, v2 emits a `LanceRead` with empty filters. 
+ let expected = if data_storage_version == LanceFileVersion::Legacy { + r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid] Take: columns="_rowid, _score, (s)" CoalesceBatchesExec: target_batch_size=8192 SortExec: expr=[_score@1 DESC NULLS LAST], preserve_partitioning=[false] @@ -10478,7 +10485,18 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") UnionExec MatchQuery: column=s, query=hello FlatMatchQuery: column=s, query=hello - LanceScan: uri=..., projection=[s], row_id=true, row_addr=false, ordered=false, range=None"#; + LanceScan: uri=..., projection=[s], row_id=true, row_addr=false, ordered=true, range=None"# + } else { + r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid] + Take: columns="_rowid, _score, (s)" + CoalesceBatchesExec: target_batch_size=8192 + SortExec: expr=[_score@1 DESC NULLS LAST], preserve_partitioning=[false] + CoalescePartitionsExec + UnionExec + MatchQuery: column=s, query=hello + FlatMatchQuery: column=s, query=hello + LanceRead: uri=..., projection=[s], num_fragments=1, range_before=None, range_after=None, row_id=true, row_addr=false, full_filter=--, refine_filter=--"# + }; dataset.append_new_data().await?; assert_plan_equals( &dataset.dataset, @@ -10511,6 +10529,10 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") .await?; log::info!("Test case: Full text search with unindexed rows and prefilter"); + // After routing flat FTS through `FilteredReadExec`, the BTree on `i` + // pushes into the unindexed-fragment scan too — no more `FilterExec` on + // top of an unfiltered `LanceScan`. Legacy uses the `MaterializeIndex` + // shape, v2 uses `LanceRead` with `full_filter` set. 
let expected = if data_storage_version == LanceFileVersion::Legacy { r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid] Take: columns="_rowid, _score, (s)" @@ -10526,8 +10548,14 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") FilterExec: i@0 > 10 LanceScan: uri=..., projection=[i], row_id=true, row_addr=false, ordered=false, range=None FlatMatchQuery: column=s, query=hello - FilterExec: i@1 > 10 - LanceScan: uri=..., projection=[s, i], row_id=true, row_addr=false, ordered=false, range=None"# + CoalescePartitionsExec + UnionExec + Take: columns="_rowid, (s)" + CoalesceBatchesExec: target_batch_size=8192 + MaterializeIndex: query=[i > 10]@i_idx(BTree) + ProjectionExec: expr=[_rowid@2 as _rowid, s@1 as s] + FilterExec: i@0 > 10 + LanceScan: uri=..., projection=[i, s], row_id=true, row_addr=false, ordered=false, range=None"# } else { r#"ProjectionExec: expr=[s@2 as s, _score@1 as _score, _rowid@0 as _rowid] Take: columns="_rowid, _score, (s)" @@ -10539,8 +10567,8 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") LanceRead: uri=..., projection=[], num_fragments=5, range_before=None, range_after=None, row_id=true, row_addr=false, full_filter=i > Int32(10), refine_filter=-- ScalarIndexQuery: query=[i > 10]@i_idx(BTree) FlatMatchQuery: column=s, query=hello - FilterExec: i@1 > 10 - LanceScan: uri=..., projection=[s, i], row_id=true, row_addr=false, ordered=false, range=None"# + LanceRead: uri=..., projection=[s], num_fragments=1, range_before=None, range_after=None, row_id=true, row_addr=false, full_filter=i > Int32(10), refine_filter=-- + ScalarIndexQuery: query=[i > 10]@i_idx(BTree)"# }; assert_plan_equals( &dataset.dataset, diff --git a/rust/lance/src/dataset/tests/dataset_index.rs b/rust/lance/src/dataset/tests/dataset_index.rs index d5c4493c8a8..267296c984b 100644 --- a/rust/lance/src/dataset/tests/dataset_index.rs +++ b/rust/lance/src/dataset/tests/dataset_index.rs @@ 
-1137,6 +1137,78 @@ async fn test_fts_without_index() { assert_eq!(results.num_rows(), 1); } +#[tokio::test] +async fn test_fts_without_index_uses_scalar_index_for_prefilter() { + // Verify that flat FTS (no inverted index on text) routes its prefilter + // through `FilteredReadExec` so a scalar index on the filter column is + // actually used. Six rows with two distinct ids: a prefilter of `id = 1` + // must match exactly the three text rows tagged with id=1. + let text = StringArray::from(vec![ + "alpha bravo", + "charlie delta", + "alpha echo", + "foxtrot", + "alpha golf", + "hotel india", + ]); + let ids = Int32Array::from(vec![1, 1, 1, 2, 2, 2]); + let batch = RecordBatch::try_new( + arrow_schema::Schema::new(vec![ + Field::new("text", text.data_type().to_owned(), false), + Field::new("id", ids.data_type().to_owned(), false), + ]) + .into(), + vec![Arc::new(text) as ArrayRef, Arc::new(ids) as ArrayRef], + ) + .unwrap(); + let schema = batch.schema(); + let batches = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema); + let test_uri = TempStrDir::default(); + let mut dataset = Dataset::write(batches, &test_uri, None).await.unwrap(); + + // Scalar index on `id` only — no FTS index on `text`. + dataset + .create_index( + &["id"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + + let mut scan = dataset.scan(); + scan.prefilter(true) + .full_text_search( + FullTextSearchQuery::new("alpha".to_owned()) + .with_columns(&["text".to_string()]) + .unwrap(), + ) + .unwrap() + .filter("id = 1") + .unwrap(); + + let plan = scan.analyze_plan().await.unwrap(); + // The flat-FTS path now reads via `FilteredReadExec` (prints as `LanceRead`) + // with the prefilter plumbed into it, so the scalar index on `id` is used. 
+ assert_contains!(&plan, "FlatMatchQuery"); + assert_contains!(&plan, "LanceRead"); + assert_contains!(&plan, "full_filter=id = Int32(1)"); + // The legacy plan ran a `LanceScan` wrapped in a manual `LanceFilterExec`; + // make sure we did not regress to that shape. + assert_not_contains!(&plan, "LanceScan:"); + + let results = scan.try_into_batch().await.unwrap(); + // Only rows with id=1 AND text matching "alpha": rows 0 ("alpha bravo") + // and 2 ("alpha echo"). Row 4 ("alpha golf") has id=2 and must be excluded. + assert_eq!( + results.num_rows(), + 2, + "expected the two id=1 rows that match `alpha`, got plan:\n{plan}" + ); +} + #[tokio::test] async fn test_fts_rank() { let params = InvertedIndexParams::default(); From d9ab79f705d7cf69542817d3c3a0b0f691008fd7 Mon Sep 17 00:00:00 2001 From: XY Zhan Date: Wed, 17 Jun 2026 14:22:09 -0400 Subject: [PATCH 125/177] fix(index): refresh PQ storage row ids after fragment-reuse remap (#7315) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Fixes a correctness bug where **IVF_PQ (and IVF_HNSW_PQ) return stale row addresses after a deferred compaction** (`defer_index_remap`), and adds correctness coverage across the deferred-compaction lifecycle for the vector index types. `ProductQuantizationStorage::new` remaps its PQ codes through the fragment-reuse index but left the `row_ids` field bound to the **pre-remap** addresses. So once an index is loaded from a version carrying a fragment-reuse index, an IVF_PQ / IVF_HNSW_PQ query returns compacted-away row addresses and the scanner take fails with: ``` The input to a take operation specified fragment id N but this fragment does not exist in the dataset ``` This happens even for **merge-only** compaction and is only observable when the query **fetches row content** (a take) — the existing `test_read_ivf_pq_index_v3_with_defer_index_remap` projects no columns (counts row ids without taking), so it didn't catch it. 
IVF_FLAT/SQ/RQ already refresh their row ids correctly. The fix refreshes `row_ids` from the remapped batch alongside `pq_code`. ## Test coverage added Parameterized helpers that, unlike the existing `test_read_*_with_defer_index_remap` tests, **project and fetch row content** so stale addresses surface: - `check_vector_defer_compaction` — fragment-reuse window correctness, with and without **materialized deletions**. - `check_vector_remap_and_trim` — merge-only, then **physical `remap_column_index` + `cleanup_frag_reuse_index`**, asserting the reuse index trims to zero versions and results stay consistent. Covered (passing): IVF_FLAT, IVF_SQ, IVF_RQ, IVF_PQ across window/deletions/remap+trim; IVF_HNSW_SQ/PQ merge-only; a scalar bitmap deletion case. Two known gaps are intentionally **not** turned into perpetually-ignored tests (they're noted in code comments instead): IVF_HNSW_* and the inverted/FTS index desync **under materialized deletions** because their positional internal structures (HNSW graph node ids; FTS `num_tokens[doc_id]`) are not realigned when the fragment-reuse drop removes rows. The HNSW case is the deferred #3993; the FTS case is a separate follow-up. 
## Testing ``` cargo test -p lance --lib defer_compaction # 6 passed cargo test -p lance --lib remap_and_trim # 6 passed ``` --- rust/lance-index/src/vector/pq/storage.rs | 7 +- rust/lance/src/dataset/optimize.rs | 543 ++++++++++++++++++++++ 2 files changed, 549 insertions(+), 1 deletion(-) diff --git a/rust/lance-index/src/vector/pq/storage.rs b/rust/lance-index/src/vector/pq/storage.rs index 68747713aac..de5a7ac28bd 100644 --- a/rust/lance-index/src/vector/pq/storage.rs +++ b/rust/lance-index/src/vector/pq/storage.rs @@ -221,7 +221,7 @@ impl ProductQuantizationStorage { "Row ID column not found from PQ storage".to_string(), )); }; - let row_ids: Arc = row_ids + let mut row_ids: Arc = row_ids .as_primitive_opt::() .ok_or(Error::index( "Row ID column is not of type UInt64".to_string(), @@ -293,6 +293,11 @@ impl ProductQuantizationStorage { .as_primitive::() .clone() .into(); + // Refresh the stored row ids from the remapped batch. Without this + // the storage keeps the pre-remap (compacted-away) addresses while + // its codes are remapped, so search returns stale row ids and the + // take fails with "fragment ... does not exist". + row_ids = batch[ROW_ID].as_primitive::().clone().into(); } let distance_type = match distance_type { diff --git a/rust/lance/src/dataset/optimize.rs b/rust/lance/src/dataset/optimize.rs index 4b5f3505f69..274fde9955e 100644 --- a/rust/lance/src/dataset/optimize.rs +++ b/rust/lance/src/dataset/optimize.rs @@ -4809,6 +4809,549 @@ mod tests { assert!(checked > 0, "expected to check at least one stored vector"); } + /// Build an `id` + `vec` dataset, create the given IVF vector index, + /// optionally delete rows, then run deferred compaction (which materializes + /// the deletions into the fragment-reuse index) and assert that KNN over + /// surviving vectors during the FRI window (a) never returns a deleted row + /// and (b) stays consistent with the pre-compaction answer. 
+ /// + /// The deletion path is the interesting one: materialized deletions drop + /// rows from the quantization storage at load time, which shifts storage + /// positions. Flat storage (FLAT/PQ/SQ/RQ) is scanned linearly so this is + /// fine, but the HNSW graph addresses storage positionally and is not + /// frag-reuse aware, so a desync would surface here as recall collapse or a + /// resurrected/again-deleted row. + /// Top-k `id`s for a KNN query against the `vec` column. + async fn vector_knn_ids(dataset: &Dataset, query: &[f32], k: usize) -> Vec { + use arrow_array::cast::AsArray; + use arrow_array::types::{Float32Type, Int32Type}; + let qa = PrimitiveArray::::from_iter_values(query.iter().copied()); + let mut scanner = dataset.scan(); + scanner.nearest("vec", &qa, k).unwrap(); + scanner.project(&["id"]).unwrap(); + let batches = scanner + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + let mut ids = Vec::new(); + for b in &batches { + ids.extend(b["id"].as_primitive::().values().iter().copied()); + } + ids + } + + async fn check_vector_defer_compaction( + params: VectorIndexParams, + delete_predicate: Option<&str>, + k: usize, + min_overlap: usize, + ) { + use arrow_array::cast::AsArray; + use arrow_array::types::{Float32Type, Int32Type}; + use lance_datagen::Dimension; + + const DIM: u32 = 32; + let mut dataset = lance_datagen::gen_batch() + .col("id", lance_datagen::array::step::()) + .col( + "vec", + lance_datagen::array::rand_vec::(Dimension::from(DIM)), + ) + .into_ram_dataset(FragmentCount::from(6), FragmentRowCount::from(1000)) + .await + .unwrap(); + + dataset + .create_index( + &["vec"], + IndexType::Vector, + Some("vec_idx".into()), + ¶ms, + false, + ) + .await + .unwrap(); + let original_uuid = dataset + .load_index_by_name("vec_idx") + .await + .unwrap() + .unwrap() + .uuid; + + if let Some(pred) = delete_predicate { + dataset.delete(pred).await.unwrap(); + } + + // Collect surviving (id, vec) pairs and the 
set of surviving ids. + let mut survivors: Vec<(i32, Vec)> = Vec::new(); + { + let mut scanner = dataset.scan(); + scanner.project(&["id", "vec"]).unwrap(); + let batches = scanner + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + for batch in &batches { + let ids = batch["id"].as_primitive::(); + let vecs = batch["vec"].as_fixed_size_list(); + for i in 0..batch.num_rows() { + let v = vecs.value(i); + let v = v.as_primitive::().values().to_vec(); + survivors.push((ids.value(i), v)); + } + } + } + assert!(!survivors.is_empty()); + let surviving_ids: std::collections::HashSet = + survivors.iter().map(|(id, _)| *id).collect(); + + // Sample queries from survivors and capture the pre-compaction answer. + let step = (survivors.len() / 16).max(1); + let queries: Vec<(i32, Vec)> = survivors.iter().step_by(step).cloned().collect(); + let mut baseline: Vec> = Vec::new(); + for (_, q) in &queries { + baseline.push(vector_knn_ids(&dataset, q, k).await); + } + + // Deferred compaction materializes the deletions into the frag-reuse index. + let metrics = compact_files( + &mut dataset, + CompactionOptions { + target_rows_per_fragment: 2_000, + defer_index_remap: true, + ..Default::default() + }, + None, + ) + .await + .unwrap(); + assert!(metrics.fragments_removed > 0); + assert!( + dataset + .load_indices() + .await + .unwrap() + .iter() + .any(|idx| idx.name == FRAG_REUSE_INDEX_NAME), + "deferred compaction must record a frag-reuse index" + ); + assert_eq!( + dataset + .load_index_by_name("vec_idx") + .await + .unwrap() + .unwrap() + .uuid, + original_uuid, + "index must not be physically remapped yet (FRI window)" + ); + + // During the FRI window: no deleted rows, and stable vs the baseline. 
+ for (i, (_, q)) in queries.iter().enumerate() { + let after = vector_knn_ids(&dataset, q, k).await; + for id in &after { + assert!( + surviving_ids.contains(id), + "KNN returned id {id} that is not a surviving row (query #{i})" + ); + } + let overlap = after.iter().filter(|id| baseline[i].contains(id)).count(); + assert!( + overlap >= min_overlap, + "KNN top-{k} diverged after deferred compaction: overlap {overlap} < {min_overlap} (query #{i})" + ); + } + } + + fn small_ivf() -> lance_index::vector::ivf::IvfBuildParams { + lance_index::vector::ivf::IvfBuildParams { + max_iters: 2, + num_partitions: Some(2), + sample_rate: 2, + ..Default::default() + } + } + + #[tokio::test] + async fn test_ivf_flat_defer_compaction_with_deletions() { + let params = VectorIndexParams::with_ivf_flat_params(DistanceType::L2, small_ivf()); + // Flat storage is scanned linearly; dropping deleted rows is exact. + check_vector_defer_compaction(params, Some("id < 1500"), 10, 10).await; + } + + #[tokio::test] + async fn test_ivf_hnsw_sq_defer_compaction_merge_only() { + use lance_index::vector::{hnsw::builder::HnswBuildParams, sq::builder::SQBuildParams}; + let params = VectorIndexParams::with_ivf_hnsw_sq_params( + DistanceType::L2, + small_ivf(), + HnswBuildParams::default(), + SQBuildParams::default(), + ); + // No deletions: storage positions are stable, so the graph stays aligned. + check_vector_defer_compaction(params, None, 10, 9).await; + } + + // NOTE: IVF_HNSW_* under materialized deletions is a known gap (lance#3993, + // HNSW auto-remap not implemented) — the HNSW graph isn't realigned after the + // frag-reuse drop. Deferred remap is gated off for HNSW tables, so there is + // no lance-level reproducer here; the gate is tested in the data plane. + // Merge-only HNSW is covered (see the *_remap_and_trim tests). 
+ + #[tokio::test] + async fn test_ivf_pq_defer_compaction_with_deletions() { + use lance_index::vector::pq::PQBuildParams; + let params = VectorIndexParams::with_ivf_pq_params( + DistanceType::L2, + small_ivf(), + PQBuildParams { + max_iters: 2, + num_sub_vectors: 2, + ..Default::default() + }, + ); + check_vector_defer_compaction(params, Some("id < 1500"), 10, 8).await; + } + + #[tokio::test] + async fn test_ivf_sq_defer_compaction_with_deletions() { + use lance_index::vector::sq::builder::SQBuildParams; + let params = VectorIndexParams::with_ivf_sq_params( + DistanceType::L2, + small_ivf(), + SQBuildParams::default(), + ); + check_vector_defer_compaction(params, Some("id < 1500"), 10, 8).await; + } + + #[tokio::test] + async fn test_ivf_rq_defer_compaction_with_deletions() { + use lance_index::vector::bq::RQBuildParams; + let params = VectorIndexParams::with_ivf_rq_params( + DistanceType::L2, + small_ivf(), + RQBuildParams::new(1), + ); + check_vector_defer_compaction(params, Some("id < 1500"), 10, 8).await; + } + + /// Merge-only deferred compaction, then a PHYSICAL remap + FRI trim. Asserts + /// the index is rebuilt, the fragment-reuse index trims to zero versions, + /// and KNN stays consistent with the pre-compaction answer through both the + /// FRI window and the physical remap. (HNSW rebuilds its graph on physical + /// remap, so the overlap is recall-tolerant.) 
+ async fn check_vector_remap_and_trim( + params: VectorIndexParams, + k: usize, + window_overlap: usize, + post_remap_overlap: Option, + ) { + use arrow_array::cast::AsArray; + use arrow_array::types::{Float32Type, Int32Type}; + use lance_datagen::Dimension; + + const DIM: u32 = 32; + let mut dataset = lance_datagen::gen_batch() + .col("id", lance_datagen::array::step::()) + .col( + "vec", + lance_datagen::array::rand_vec::(Dimension::from(DIM)), + ) + .into_ram_dataset(FragmentCount::from(6), FragmentRowCount::from(1000)) + .await + .unwrap(); + dataset + .create_index( + &["vec"], + IndexType::Vector, + Some("vec_idx".into()), + ¶ms, + false, + ) + .await + .unwrap(); + let original_uuid = dataset + .load_index_by_name("vec_idx") + .await + .unwrap() + .unwrap() + .uuid; + + // Sample queries from stored vectors + capture the pre-compaction answer. + let mut rows: Vec> = Vec::new(); + { + let mut scanner = dataset.scan(); + scanner.project(&["vec"]).unwrap(); + let batches = scanner + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + for batch in &batches { + let vecs = batch["vec"].as_fixed_size_list(); + for i in 0..batch.num_rows() { + let v = vecs.value(i); + rows.push(v.as_primitive::().values().to_vec()); + } + } + } + let step = (rows.len() / 16).max(1); + let queries: Vec> = rows.iter().step_by(step).cloned().collect(); + let mut baseline: Vec> = Vec::new(); + for q in &queries { + baseline.push(vector_knn_ids(&dataset, q, k).await); + } + + // Merge-only deferred compaction. 
+ let metrics = compact_files( + &mut dataset, + CompactionOptions { + target_rows_per_fragment: 2_000, + defer_index_remap: true, + ..Default::default() + }, + None, + ) + .await + .unwrap(); + assert!(metrics.fragments_removed > 0); + assert_eq!( + dataset + .load_index_by_name("vec_idx") + .await + .unwrap() + .unwrap() + .uuid, + original_uuid, + "index must not be physically remapped yet (FRI window)" + ); + for (i, q) in queries.iter().enumerate() { + let window = vector_knn_ids(&dataset, q, k).await; + let overlap = window.iter().filter(|id| baseline[i].contains(id)).count(); + assert!( + overlap >= window_overlap, + "FRI-window KNN diverged: overlap {overlap} < {window_overlap} (query #{i})" + ); + } + + // Physical remap + trim the fragment-reuse index. + remapping::remap_column_index(&mut dataset, &["vec"], Some("vec_idx".into())) + .await + .unwrap(); + cleanup_frag_reuse_index(&mut dataset).await.unwrap(); + + let remapped_uuid = dataset + .load_index_by_name("vec_idx") + .await + .unwrap() + .unwrap() + .uuid; + assert_ne!( + remapped_uuid, original_uuid, + "index should have been physically remapped" + ); + if let Some(meta) = dataset + .load_index_by_name(FRAG_REUSE_INDEX_NAME) + .await + .unwrap() + { + let versions = load_frag_reuse_index_details(&dataset, &meta) + .await + .unwrap() + .versions + .len(); + assert_eq!(versions, 0, "frag-reuse index must trim to zero versions"); + } + + for (i, q) in queries.iter().enumerate() { + let after = vector_knn_ids(&dataset, q, k).await; + // No stale/desynced addresses (a bad address fails the take above). + assert!( + !after.is_empty(), + "post-remap KNN returned no rows (query #{i})" + ); + // Physical remap rebuilds the HNSW graph, so recall is only compared + // for the exact (non-HNSW) types. 
+ if let Some(min_overlap) = post_remap_overlap { + let overlap = after.iter().filter(|id| baseline[i].contains(id)).count(); + assert!( + overlap >= min_overlap, + "post-remap KNN diverged: overlap {overlap} < {min_overlap} (query #{i})" + ); + } + } + } + + #[tokio::test] + async fn test_ivf_flat_remap_and_trim() { + let params = VectorIndexParams::with_ivf_flat_params(DistanceType::L2, small_ivf()); + check_vector_remap_and_trim(params, 10, 8, Some(8)).await; + } + + // Regression: PQ storage used to remap its codes through the frag-reuse + // index but keep the pre-remap `row_ids` field, so search returned stale + // (compacted-away) addresses and the take failed with "fragment ... does + // not exist" — even merge-only, and only observable when the query fetches + // row content (the existing `test_read_ivf_pq_index_v3_with_defer_index_remap` + // projects no columns, so it never takes and missed this). + #[tokio::test] + async fn test_ivf_pq_remap_and_trim() { + use lance_index::vector::pq::PQBuildParams; + let params = VectorIndexParams::with_ivf_pq_params( + DistanceType::L2, + small_ivf(), + PQBuildParams { + max_iters: 2, + num_sub_vectors: 2, + ..Default::default() + }, + ); + check_vector_remap_and_trim(params, 10, 8, Some(8)).await; + } + + #[tokio::test] + async fn test_ivf_sq_remap_and_trim() { + use lance_index::vector::sq::builder::SQBuildParams; + let params = VectorIndexParams::with_ivf_sq_params( + DistanceType::L2, + small_ivf(), + SQBuildParams::default(), + ); + check_vector_remap_and_trim(params, 10, 8, Some(8)).await; + } + + #[tokio::test] + async fn test_ivf_rq_remap_and_trim() { + use lance_index::vector::bq::RQBuildParams; + let params = VectorIndexParams::with_ivf_rq_params( + DistanceType::L2, + small_ivf(), + RQBuildParams::new(1), + ); + check_vector_remap_and_trim(params, 10, 8, Some(8)).await; + } + + #[tokio::test] + async fn test_ivf_hnsw_sq_remap_and_trim() { + use lance_index::vector::{hnsw::builder::HnswBuildParams, 
sq::builder::SQBuildParams}; + let params = VectorIndexParams::with_ivf_hnsw_sq_params( + DistanceType::L2, + small_ivf(), + HnswBuildParams::default(), + SQBuildParams::default(), + ); + // Physical remap rebuilds the HNSW graph, so use a recall-tolerant overlap. + check_vector_remap_and_trim(params, 10, 7, None).await; + } + + #[tokio::test] + async fn test_ivf_hnsw_pq_remap_and_trim() { + use lance_index::vector::{hnsw::builder::HnswBuildParams, pq::PQBuildParams}; + let params = VectorIndexParams::with_ivf_hnsw_pq_params( + DistanceType::L2, + small_ivf(), + HnswBuildParams::default(), + PQBuildParams { + max_iters: 2, + num_sub_vectors: 2, + ..Default::default() + }, + ); + check_vector_remap_and_trim(params, 10, 7, None).await; + } + + // Scalar index correctness across deferred compaction WITH materialized + // deletions. The existing test_read_*_index_with_defer_index_remap tests are + // merge-only and project no columns (count-only), so they never take and + // never exercise the deletion drop path. These add an `id` column, delete a + // prefix, defer-compact, then run the indexed query *projecting id* (a take) + // and assert no deleted row is returned. Bitmap/BTree have no positional + // internal structure so the drop path is exact; the Inverted (FTS) index + // does (see its test below), and currently desyncs under deletions. 
+ + #[tokio::test] + async fn test_bitmap_index_defer_compaction_with_deletions() { + use arrow_array::cast::AsArray; + use arrow_array::types::Int32Type; + let mut dataset = lance_datagen::gen_batch() + .col("id", lance_datagen::array::step::()) + .col( + "category", + lance_datagen::array::cycle::(vec![1, 2, 3]), + ) + .into_ram_dataset(FragmentCount::from(6), FragmentRowCount::from(1000)) + .await + .unwrap(); + dataset + .create_index( + &["category"], + IndexType::Bitmap, + Some("category_idx".into()), + &ScalarIndexParams::default(), + false, + ) + .await + .unwrap(); + dataset.delete("id < 1500").await.unwrap(); + let metrics = compact_files( + &mut dataset, + CompactionOptions { + target_rows_per_fragment: 2_000, + defer_index_remap: true, + ..Default::default() + }, + None, + ) + .await + .unwrap(); + assert!(metrics.fragments_removed > 0); + assert!( + dataset + .load_indices() + .await + .unwrap() + .iter() + .any(|idx| idx.name == FRAG_REUSE_INDEX_NAME), + "deferred compaction must record a frag-reuse index" + ); + + let mut scanner = dataset.scan(); + scanner.filter("category = 3").unwrap(); + scanner.project(&["id"]).unwrap(); + let batches = scanner + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap(); + let mut returned = 0; + for b in &batches { + for id in b["id"].as_primitive::().values() { + assert!( + *id >= 1500, + "bitmap returned deleted id {id} in the FRI window" + ); + returned += 1; + } + } + assert!(returned > 0, "expected surviving category=3 rows"); + } + + // NOTE: Inverted/FTS under materialized deletions is broken (BM25 scores + // via positional num_tokens[doc_id]; the frag-reuse drop shifts doc_id + // positions -> out-of-bounds). It is gated off defer in the data plane + // until fixed, so there is no lance-level reproducer here. Merge-only FTS + // is covered by test_read_inverted_index_with_defer_index_remap. 
+ #[tokio::test] async fn test_default_compaction_planner() { let test_dir = TempStrDir::default(); From 881892592e7b72e67daa8847db15b8ab5cfd3146 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Wed, 17 Jun 2026 11:24:34 -0700 Subject: [PATCH 126/177] feat(dir-catalog): add reader/writer feature flags to __manifest (#7191) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Adds forward-compatibility infrastructure to the directory-catalog `__manifest` dataset, mirroring the Lance table format's reader/writer feature flags but at the catalog-manifest layer. - Persists two `u64` bitmasks in the `__manifest` dataset's `table_metadata` (`lance.namespace.manifest.reader_feature_flags` / `writer_feature_flags`). Absent keys parse as `0`, so every existing manifest stays universally compatible. - A build refuses to read or write a manifest that sets a flag it does not understand, returning a clear "please upgrade" error instead of misreading it. Reader and writer checks are enforced centrally: in the manifest consistency wrapper, at catalog open, and on the copy-on-write mutation path. - Also stops the directory catalog from silently degrading to directory listing when the manifest is incompatible — `build()` and the per-operation fallbacks propagate the incompatibility instead of masking it, so the check cannot be bypassed. This is the **mechanism only**: no manifest feature is defined yet, so the known masks are `0` and nothing is ever set — **zero behavior change** today. It is the prerequisite so that a future `__manifest` format change (e.g. a schema migration) can be shipped safely: that change adds its bit to the known masks and stamps it on write, and from then on older clients refuse the new format instead of misreading it. 
--- rust/lance-namespace-impls/src/dir.rs | 17 ++ .../lance-namespace-impls/src/dir/manifest.rs | 189 ++++++++++++++++- .../src/dir/manifest_feature_flags.rs | 194 ++++++++++++++++++ 3 files changed, 394 insertions(+), 6 deletions(-) create mode 100644 rust/lance-namespace-impls/src/dir/manifest_feature_flags.rs diff --git a/rust/lance-namespace-impls/src/dir.rs b/rust/lance-namespace-impls/src/dir.rs index b8b8a126b7c..e97c5c836b7 100644 --- a/rust/lance-namespace-impls/src/dir.rs +++ b/rust/lance-namespace-impls/src/dir.rs @@ -7,6 +7,7 @@ //! that stores tables as Lance datasets in a filesystem directory structure. pub mod manifest; +pub mod manifest_feature_flags; use arrow::array::Float32Array; use arrow::record_batch::RecordBatchIterator; @@ -682,6 +683,12 @@ impl DirectoryNamespaceBuilder { .await { Ok(ns) => Some(Arc::new(ns)), + Err(e) if manifest_feature_flags::is_incompatible_manifest_error(&e) => { + // The manifest exists but was written with a feature flag this + // build does not understand. Refuse rather than silently + // degrading to a directory-listing view that ignores it. + return Err(e); + } Err(e) => { // Failed to initialize manifest namespace, fall back to directory listing only log::warn!( @@ -1372,6 +1379,11 @@ impl DirectoryNamespace { } return Ok(response); } + Err(e) if manifest_feature_flags::is_incompatible_manifest_error(&e) => { + // An incompatible manifest must surface "please upgrade" + // rather than degrading to a directory-listing view. + return Err(e); + } Err(_) if self.dir_listing_enabled && is_root_level => { // Fall through to directory check only for single-level IDs } @@ -2599,6 +2611,11 @@ impl LanceNamespace for DirectoryNamespace { { match manifest_ns.table_exists(request.clone()).await { Ok(()) => return Ok(()), + Err(e) if manifest_feature_flags::is_incompatible_manifest_error(&e) => { + // An incompatible manifest must surface "please upgrade" + // rather than degrading to a directory-listing view. 
+ return Err(e); + } Err(_) if self.dir_listing_enabled && is_root_level => { // Fall through to directory check only for single-level IDs } diff --git a/rust/lance-namespace-impls/src/dir/manifest.rs b/rust/lance-namespace-impls/src/dir/manifest.rs index 4f3e53ba806..aae924378da 100644 --- a/rust/lance-namespace-impls/src/dir/manifest.rs +++ b/rust/lance-namespace-impls/src/dir/manifest.rs @@ -6,6 +6,7 @@ //! This module provides a namespace implementation that uses a manifest table //! to track tables and nested namespaces. +use super::manifest_feature_flags::{ensure_readable, ensure_writable}; use arrow::array::builder::{ListBuilder, StringBuilder}; use arrow::array::{Array, ListArray, RecordBatch, RecordBatchIterator, StringArray, UInt64Array}; use arrow::datatypes::{DataType, Field, Schema as ArrowSchema, SchemaRef}; @@ -607,26 +608,35 @@ impl DatasetConsistencyWrapper { /// Always reloads to ensure strong consistency. pub async fn get(&self) -> Result> { self.reload().await?; - Ok(DatasetReadGuard { + let guard = DatasetReadGuard { guard: self.0.read().await, - }) + }; + // Refuse manifests written with a reader feature flag this build does + // not understand instead of misreading them. + ensure_readable(guard.metadata())?; + Ok(guard) } /// Reload the dataset and return a reference. pub async fn get_refreshed(&self) -> Result> { self.reload().await?; - Ok(DatasetReadGuard { + let guard = DatasetReadGuard { guard: self.0.read().await, - }) + }; + ensure_readable(guard.metadata())?; + Ok(guard) } /// Get a mutable reference to the dataset. /// Always reloads to ensure strong consistency. pub async fn get_mut(&self) -> Result> { self.reload().await?; - Ok(DatasetWriteGuard { + let guard = DatasetWriteGuard { guard: self.0.write().await, - }) + }; + ensure_readable(guard.metadata())?; + ensure_writable(guard.metadata())?; + Ok(guard) } /// Provide a known latest version of the dataset. 
@@ -1845,6 +1855,16 @@ impl ManifestNamespace { } } + /// Validate that this build can write the current `__manifest` before a + /// mutating operation performs any side effect (e.g. writing table data), so + /// a refused write leaves nothing orphaned behind. The eventual + /// `rewrite_manifest` commit re-checks `ensure_writable` on each retry, so a + /// concurrent upgrade in between is still caught. + async fn ensure_manifest_writable(&self) -> Result<()> { + let dataset_guard = self.manifest_dataset.get().await?; + ensure_writable(dataset_guard.metadata()) + } + async fn rewrite_manifest( &self, operation: &str, @@ -1864,6 +1884,9 @@ impl ManifestNamespace { let dataset_guard = self.manifest_dataset.get_refreshed().await?; let dataset = Arc::new(dataset_guard.clone()); drop(dataset_guard); + // Refuse to mutate a manifest written with a writer feature flag this + // build does not understand. + ensure_writable(dataset.metadata())?; // Staged files, indices, the commit, and cleanup must all use the dataset's // own object store (see `commit_manifest_overwrite`). let object_store = dataset.object_store(None).await?; @@ -2408,6 +2431,10 @@ impl ManifestNamespace { .load() .await; if let Ok(mut dataset) = dataset_result { + // Reject a manifest written with a reader feature flag this build + // does not understand before touching it. + ensure_readable(dataset.metadata())?; + // Check if the object_id field has primary key metadata, migrate if not let needs_pk_migration = dataset .schema() @@ -2419,6 +2446,9 @@ impl ManifestNamespace { .unwrap_or(false); if needs_pk_migration { + // This legacy migration writes to the manifest, so confirm this + // build is allowed to write the current format first. 
+ ensure_writable(dataset.metadata())?; log::info!("Migrating __manifest table to add primary key metadata on object_id"); dataset .update_field_metadata() @@ -2828,6 +2858,10 @@ impl LanceNamespace for ManifestNamespace { let (namespace, table_name) = Self::split_object_id(table_id); let object_id = Self::build_object_id(&namespace, &table_name); + // Refuse before writing any table data if this build cannot write the + // manifest, so a refused create leaves no orphaned dataset behind. + self.ensure_manifest_writable().await?; + let existing_table = self.query_manifest_for_table(&object_id).await?; let existing_has_manifests = if let Some(existing_table) = &existing_table { Some( @@ -3788,6 +3822,149 @@ mod tests { buffer } + /// Open the `__manifest` dataset directly and set a table-metadata key, + /// simulating a future Lance client that persisted a feature flag. + async fn set_manifest_table_metadata(temp_path: &str, key: &str, value: &str) { + use lance::dataset::builder::DatasetBuilder; + let mut ds = DatasetBuilder::from_uri(format!("{}/{}", temp_path, MANIFEST_TABLE_NAME)) + .load() + .await + .unwrap(); + ds.update_metadata([(key, value)]).await.unwrap(); + } + + async fn create_namespace_with_one_table(temp_path: &str) { + let ns = DirectoryNamespaceBuilder::new(temp_path) + .build() + .await + .unwrap(); + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(vec!["t1".to_string()]); + ns.create_table(create_request, Bytes::from(create_test_ipc_data())) + .await + .unwrap(); + } + + /// This is a forward-compatibility checker only: it must not set any feature + /// flag, so existing clients keep treating the manifest as compatible. 
+ #[tokio::test] + async fn test_manifest_has_no_feature_flags_by_default() { + use lance::dataset::builder::DatasetBuilder; + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + create_namespace_with_one_table(temp_path).await; + + let ds = DatasetBuilder::from_uri(format!("{}/{}", temp_path, MANIFEST_TABLE_NAME)) + .load() + .await + .unwrap(); + assert!( + !ds.metadata() + .contains_key(crate::dir::manifest_feature_flags::READER_FEATURE_FLAGS_KEY) + ); + assert!( + !ds.metadata() + .contains_key(crate::dir::manifest_feature_flags::WRITER_FEATURE_FLAGS_KEY) + ); + } + + /// An unknown reader feature flag must block opening the catalog with a clear + /// "please upgrade" error rather than silently degrading to directory listing. + #[tokio::test] + async fn test_unknown_reader_flag_blocks_access() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + create_namespace_with_one_table(temp_path).await; + set_manifest_table_metadata( + temp_path, + crate::dir::manifest_feature_flags::READER_FEATURE_FLAGS_KEY, + "1", + ) + .await; + + let err = DirectoryNamespaceBuilder::new(temp_path) + .build() + .await + .expect_err("opening a manifest with an unknown reader flag should fail"); + assert!( + err.to_string().to_lowercase().contains("upgrade"), + "expected an upgrade error, got: {err}" + ); + } + + /// An unknown writer feature flag must still allow reads but block writes. 
+ #[tokio::test] + async fn test_unknown_writer_flag_blocks_writes_but_allows_reads() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + create_namespace_with_one_table(temp_path).await; + set_manifest_table_metadata( + temp_path, + crate::dir::manifest_feature_flags::WRITER_FEATURE_FLAGS_KEY, + "1", + ) + .await; + + let ns = DirectoryNamespaceBuilder::new(temp_path) + .build() + .await + .expect("reads should still be allowed with only a writer flag set"); + let mut list_request = ListTablesRequest::new(); + list_request.id = Some(vec![]); + assert_eq!(ns.list_tables(list_request).await.unwrap().tables.len(), 1); + + // A refused write must not leave an orphaned table dataset behind. + let entries_before = dir_entry_names(temp_path); + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(vec!["t2".to_string()]); + let err = ns + .create_table(create_request, Bytes::from(create_test_ipc_data())) + .await + .expect_err("writing through an unknown writer flag should fail"); + assert!( + err.to_string().to_lowercase().contains("upgrade"), + "expected an upgrade error, got: {err}" + ); + assert_eq!( + entries_before, + dir_entry_names(temp_path), + "a refused create_table must not create an orphaned table directory" + ); + + // Mutations that go straight through rewrite_manifest (no early + // create_table check) must also be refused: an insert (create_namespace) + // and a delete (drop_table). This proves the writer check is enforced at + // the single copy-on-write chokepoint, not just on the create_table path. 
+ let mut create_ns = CreateNamespaceRequest::new(); + create_ns.id = Some(vec!["ns1".to_string()]); + let err = ns + .create_namespace(create_ns) + .await + .expect_err("create_namespace through an unknown writer flag should fail"); + assert!( + err.to_string().to_lowercase().contains("upgrade"), + "expected an upgrade error, got: {err}" + ); + + let mut drop_request = DropTableRequest::new(); + drop_request.id = Some(vec!["t1".to_string()]); + let err = ns + .drop_table(drop_request) + .await + .expect_err("drop_table through an unknown writer flag should fail"); + assert!( + err.to_string().to_lowercase().contains("upgrade"), + "expected an upgrade error, got: {err}" + ); + } + + fn dir_entry_names(path: &str) -> std::collections::BTreeSet { + std::fs::read_dir(path) + .unwrap() + .map(|e| e.unwrap().file_name().to_string_lossy().into_owned()) + .collect() + } + #[tokio::test] async fn test_manifest_rewrite_preserves_utf8_metadata_and_base_objects() { let temp_dir = TempStdDir::default(); diff --git a/rust/lance-namespace-impls/src/dir/manifest_feature_flags.rs b/rust/lance-namespace-impls/src/dir/manifest_feature_flags.rs new file mode 100644 index 00000000000..d0849ceda4f --- /dev/null +++ b/rust/lance-namespace-impls/src/dir/manifest_feature_flags.rs @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Reader/writer feature flags for the directory-catalog `__manifest` dataset. +//! +//! Forward-compatibility infrastructure for the `__manifest` Lance dataset, +//! analogous to the Lance table format's `reader_feature_flags` / +//! `writer_feature_flags` but describing the *catalog manifest* format (schema +//! and semantics) rather than the underlying Lance file format. The flags are +//! persisted in the `__manifest` dataset's `table_metadata` map. +//! +//! Each manifest feature owns one bit in a `u64` bitmask. A build may read a +//! 
`__manifest` only if it understands every set reader-flag bit, and may write +//! it only if it understands every set writer-flag bit; otherwise it fails fast +//! with a clear "please upgrade" error instead of silently misreading data. The +//! set of bits a build understands is `READER_KNOWN_FLAGS` / `WRITER_KNOWN_FLAGS`. +//! +//! This is the mechanism only: no manifest feature is defined yet, so the known +//! masks are `0` and nothing is ever set — every current manifest reads and +//! writes unchanged. The first format change that needs forward-compatibility +//! protection adds its bit to the known masks and stamps it on write; from then +//! on, builds without that bit refuse the new format rather than misreading it. +//! Manifests written before this mechanism carry no flag keys, which parse as +//! `0` and stay compatible with every build. + +use std::collections::HashMap; + +use lance_core::{Error, Result}; +use lance_namespace::error::NamespaceError; + +/// `table_metadata` key holding the reader feature-flag bitmask (decimal `u64`). +pub const READER_FEATURE_FLAGS_KEY: &str = "lance.namespace.manifest.reader_feature_flags"; +/// `table_metadata` key holding the writer feature-flag bitmask (decimal `u64`). +pub const WRITER_FEATURE_FLAGS_KEY: &str = "lance.namespace.manifest.writer_feature_flags"; + +/// Reader feature-flag bits this build understands. No manifest feature is +/// defined yet, so this build understands none and refuses any non-zero reader +/// flag. A future format change adds its bit here. +const READER_KNOWN_FLAGS: u64 = 0; +/// Writer feature-flag bits this build understands. +const WRITER_KNOWN_FLAGS: u64 = 0; + +/// Whether this build can read a `__manifest` whose persisted reader feature +/// flags are `reader_flags` — i.e. it understands every set bit. 
+pub fn can_read_manifest(reader_flags: u64) -> bool { + (reader_flags & !READER_KNOWN_FLAGS) == 0 +} + +/// Whether this build can write a `__manifest` whose persisted writer feature +/// flags are `writer_flags` — i.e. it understands every set bit. +pub fn can_write_manifest(writer_flags: u64) -> bool { + (writer_flags & !WRITER_KNOWN_FLAGS) == 0 +} + +fn parse_flags(table_metadata: &HashMap, key: &str) -> Result { + match table_metadata.get(key) { + None => Ok(0), + Some(raw) => raw.parse::().map_err(|e| { + Error::from(NamespaceError::Unsupported { + message: format!( + "The __manifest dataset has an unparsable feature-flag value '{raw}' for \ + '{key}': {e}. This likely means it was written by a newer, incompatible \ + version of Lance; please upgrade Lance to use this catalog." + ), + }) + }), + } +} + +/// Reader feature flags persisted in the `__manifest` `table_metadata` (`0` if absent). +pub fn reader_flags(table_metadata: &HashMap) -> Result { + parse_flags(table_metadata, READER_FEATURE_FLAGS_KEY) +} + +/// Writer feature flags persisted in the `__manifest` `table_metadata` (`0` if absent). +pub fn writer_flags(table_metadata: &HashMap) -> Result { + parse_flags(table_metadata, WRITER_FEATURE_FLAGS_KEY) +} + +/// Validate that this build can READ the `__manifest` described by `table_metadata`, +/// returning a clear "please upgrade" error otherwise. +pub fn ensure_readable(table_metadata: &HashMap) -> Result<()> { + let flags = reader_flags(table_metadata)?; + if !can_read_manifest(flags) { + return Err(Error::from(NamespaceError::Unsupported { + message: format!( + "The __manifest dataset was written with reader feature flags {flags}, which this \ + version of Lance does not understand (known reader flags: {READER_KNOWN_FLAGS}). \ + Please upgrade Lance to read this catalog." + ), + })); + } + Ok(()) +} + +/// Validate that this build can WRITE the `__manifest` described by `table_metadata`, +/// returning a clear "please upgrade" error otherwise. 
+pub fn ensure_writable(table_metadata: &HashMap) -> Result<()> { + let flags = writer_flags(table_metadata)?; + if !can_write_manifest(flags) { + return Err(Error::from(NamespaceError::Unsupported { + message: format!( + "The __manifest dataset was written with writer feature flags {flags}, which this \ + version of Lance does not understand (known writer flags: {WRITER_KNOWN_FLAGS}). \ + Please upgrade Lance to modify this catalog." + ), + })); + } + Ok(()) +} + +/// Whether `err` indicates the `__manifest` is in a format this build cannot +/// handle — i.e. it carries an unknown reader/writer feature flag, surfaced by +/// [`ensure_readable`] / [`ensure_writable`] as a [`NamespaceError::Unsupported`]. +/// +/// Catalog initialization uses this to refuse opening such a manifest rather +/// than silently degrading to a directory-listing view that ignores it. The +/// `__manifest` open path raises no other `Unsupported` error, so matching the +/// code is sufficient and avoids brittle message matching. +pub fn is_incompatible_manifest_error(err: &Error) -> bool { + matches!( + err, + Error::Namespace { source, .. } + if source + .downcast_ref::() + .is_some_and(|e| matches!(e, NamespaceError::Unsupported { .. })) + ) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn meta(pairs: &[(&str, &str)]) -> HashMap { + pairs + .iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect() + } + + #[test] + fn unflagged_is_compatible() { + assert!(can_read_manifest(0)); + assert!(can_write_manifest(0)); + let empty = HashMap::new(); + assert!(ensure_readable(&empty).is_ok()); + assert!(ensure_writable(&empty).is_ok()); + assert_eq!(reader_flags(&empty).unwrap(), 0); + assert_eq!(writer_flags(&empty).unwrap(), 0); + // Explicit zeroes are also compatible. 
+ let zeroed = meta(&[ + (READER_FEATURE_FLAGS_KEY, "0"), + (WRITER_FEATURE_FLAGS_KEY, "0"), + ]); + assert!(ensure_readable(&zeroed).is_ok()); + assert!(ensure_writable(&zeroed).is_ok()); + } + + #[test] + fn any_unknown_flag_is_refused() { + // This build understands no feature flags, so any non-zero bit is refused. + assert!(!can_read_manifest(1)); + assert!(!can_write_manifest(1)); + assert!(!can_read_manifest(1 << 30)); + assert!(!can_write_manifest(1 << 63)); + + let reader = meta(&[(READER_FEATURE_FLAGS_KEY, "1")]); + let err = ensure_readable(&reader).unwrap_err(); + assert!(err.to_string().to_lowercase().contains("upgrade")); + assert!(is_incompatible_manifest_error(&err)); + // A reader flag does not block writers that the writer mask allows. + assert!(ensure_writable(&reader).is_ok()); + + let writer = meta(&[(WRITER_FEATURE_FLAGS_KEY, "2")]); + let err = ensure_writable(&writer).unwrap_err(); + assert!(err.to_string().to_lowercase().contains("upgrade")); + assert!(is_incompatible_manifest_error(&err)); + } + + #[test] + fn unparsable_value_is_refused() { + let m = meta(&[(READER_FEATURE_FLAGS_KEY, "not-a-number")]); + assert!(reader_flags(&m).is_err()); + assert!(ensure_readable(&m).is_err()); + } + + #[test] + fn unrelated_error_is_not_an_incompatibility() { + let other = Error::from(NamespaceError::TableNotFound { + message: "x".to_string(), + }); + assert!(!is_incompatible_manifest_error(&other)); + } +} From af47bc53cfe84848c476770508d91fb65f264cc3 Mon Sep 17 00:00:00 2001 From: YueZhang <69956021+zhangyue19921010@users.noreply.github.com> Date: Thu, 18 Jun 2026 02:46:37 +0800 Subject: [PATCH 127/177] feat(java): expose RTree scalar index type to Java (#7291) Co-authored-by: zhangyue19921010 --- java/lance-jni/src/index.rs | 2 + .../main/java/org/lance/index/IndexType.java | 1 + .../lance/index/scalar/ScalarIndexParams.java | 6 +- .../java/org/lance/index/ScalarIndexTest.java | 78 +++++++++++++++++++ 4 files changed, 84 insertions(+), 3 deletions(-) 
diff --git a/java/lance-jni/src/index.rs b/java/lance-jni/src/index.rs index 1e533eed9fc..6cb64a05a81 100644 --- a/java/lance-jni/src/index.rs +++ b/java/lance-jni/src/index.rs @@ -173,6 +173,8 @@ fn determine_index_type<'local>( Some("ZONEMAP") } else if lower.contains("bloomfilter") { Some("BLOOM_FILTER") + } else if lower.contains("rtree") { + Some("RTREE") } else if lower.contains("ivfhnsw") { if lower.contains("sq") { Some("IVF_HNSW_SQ") diff --git a/java/src/main/java/org/lance/index/IndexType.java b/java/src/main/java/org/lance/index/IndexType.java index 3a03934effd..1fff86fc7e0 100644 --- a/java/src/main/java/org/lance/index/IndexType.java +++ b/java/src/main/java/org/lance/index/IndexType.java @@ -24,6 +24,7 @@ public enum IndexType { MEM_WAL(7), ZONEMAP(8), BLOOM_FILTER(9), + RTREE(10), VECTOR(100), IVF_FLAT(101), IVF_SQ(102), diff --git a/java/src/main/java/org/lance/index/scalar/ScalarIndexParams.java b/java/src/main/java/org/lance/index/scalar/ScalarIndexParams.java index 345a55f20b2..b3408e2d68d 100644 --- a/java/src/main/java/org/lance/index/scalar/ScalarIndexParams.java +++ b/java/src/main/java/org/lance/index/scalar/ScalarIndexParams.java @@ -31,7 +31,7 @@ private ScalarIndexParams(Builder builder) { * Create a new ScalarIndexParams with the given index type and no parameters. * * @param indexType the index type (e.g., "btree", "zonemap", "bitmap", "inverted", "labellist", - * "ngram") + * "ngram", "rtree") * @return ScalarIndexParams */ public static ScalarIndexParams create(String indexType) { @@ -42,7 +42,7 @@ public static ScalarIndexParams create(String indexType) { * Create a new ScalarIndexParams with the given index type and JSON parameters. 
* * @param indexType the index type (e.g., "btree", "zonemap", "bitmap", "inverted", "labellist", - * "ngram") + * "ngram", "rtree") * @param jsonParams JSON string containing index-specific parameters * @return ScalarIndexParams */ @@ -58,7 +58,7 @@ public static class Builder { * Create a new builder for scalar index parameters. * * @param indexType the index type (e.g., "btree", "zonemap", "bitmap", "inverted", "labellist", - * "ngram") + * "ngram", "rtree") */ public Builder(String indexType) { this.indexType = indexType; diff --git a/java/src/test/java/org/lance/index/ScalarIndexTest.java b/java/src/test/java/org/lance/index/ScalarIndexTest.java index b993a7e8a5f..cb090e7c955 100644 --- a/java/src/test/java/org/lance/index/ScalarIndexTest.java +++ b/java/src/test/java/org/lance/index/ScalarIndexTest.java @@ -25,14 +25,18 @@ import org.apache.arrow.c.Data; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.Float8Vector; import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.UInt8Vector; import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.ipc.ArrowReader; import org.apache.arrow.vector.ipc.ArrowStreamReader; import org.apache.arrow.vector.ipc.ArrowStreamWriter; +import org.apache.arrow.vector.types.FloatingPointPrecision; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.types.pojo.Schema; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; @@ -318,4 +322,78 @@ public void testCreateZonemapIndex(@TempDir Path tempDir) throws Exception { } } } + + @Test + public void testCreateRTreeIndex(@TempDir Path tempDir) throws Exception { + String datasetPath = tempDir.resolve("rtree_test").toString(); + ArrowType f64 = new 
ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE); + Field geometryField = + new Field( + "geometry", + new FieldType( + true, + new ArrowType.Struct(), + null, + Collections.singletonMap("ARROW:extension:name", "geoarrow.point")), + Arrays.asList(Field.notNullable("x", f64), Field.notNullable("y", f64))); + Schema schema = new Schema(Collections.singletonList(geometryField), null); + + int rowCount = 3; + try (RootAllocator allocator = new RootAllocator(); + VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) { + root.allocateNew(); + StructVector geometry = (StructVector) root.getVector("geometry"); + Float8Vector x = (Float8Vector) geometry.getChild("x"); + Float8Vector y = (Float8Vector) geometry.getChild("y"); + for (int i = 0; i < rowCount; i++) { + geometry.setIndexDefined(i); + x.setSafe(i, (double) i); + y.setSafe(i, i * 2.0); + } + geometry.setValueCount(rowCount); + root.setRowCount(rowCount); + + ByteArrayOutputStream out = new ByteArrayOutputStream(); + try (ArrowStreamWriter writer = new ArrowStreamWriter(root, null, out)) { + writer.start(); + writer.writeBatch(); + writer.end(); + } + + try (ArrowStreamReader reader = + new ArrowStreamReader(new ByteArrayInputStream(out.toByteArray()), allocator); + Dataset dataset = + Dataset.write() + .reader(reader) + .uri(datasetPath) + .allocator(allocator) + .mode(WriteParams.WriteMode.CREATE) + .execute()) { + // The point data round-trips through Lance. + assertEquals(rowCount, dataset.countRows()); + try (ArrowReader scan = dataset.newScan(new ScanOptions.Builder().build()).scanBatches()) { + assertTrue(scan.loadNextBatch()); + StructVector readGeometry = + (StructVector) scan.getVectorSchemaRoot().getVector("geometry"); + assertEquals(2.0, ((Float8Vector) readGeometry.getChild("x")).get(2)); + assertEquals(4.0, ((Float8Vector) readGeometry.getChild("y")).get(2)); + } + + // Creating and listing an RTree index via the typed IndexType works end-to-end. 
+ Index index = + dataset.createIndex( + Collections.singletonList("geometry"), + IndexType.RTREE, + Optional.of("rtree_geometry_index"), + IndexParams.builder() + .setScalarIndexParams(ScalarIndexParams.create("rtree")) + .build(), + true); + assertEquals(IndexType.RTREE, index.indexType()); + assertTrue( + dataset.listIndexes().contains("rtree_geometry_index"), + "Expected 'rtree_geometry_index' in: " + dataset.listIndexes()); + } + } + } } From ae74092c529d9a26246501ffeca8c2de03c01267 Mon Sep 17 00:00:00 2001 From: XY Zhan Date: Wed, 17 Jun 2026 14:47:19 -0400 Subject: [PATCH 128/177] fix(fts): keep inverted index correct under deletions in the fragment-reuse window (#7325) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary A deferred-remap compaction that materializes deletions writes a fragment-reuse index (FRI) that the inverted (full-text-search) index is read through at load time. The load-time path dropped the deleted rows and renumbered the surviving `doc_id`s — but the posting lists reference `doc_id`s **positionally** (a `doc_id` is an index into the `DocSet`'s `row_ids` / `num_tokens` arrays, fixed at build time) and are not regenerated at load. Dropping rows shifted every later `doc_id` out from under the posting lists, so a query would index `num_tokens` / `row_ids` out of bounds (panic) or score/return the wrong document. This is deletion-specific: merge-only deferred compaction remaps every row to `Some(new_addr)`, so nothing is dropped and positions stay aligned. It only breaks when deletions are materialized (`remap_row_id` returns `None`). 
## Fix: tombstone-preserve-positions In `DocSet::from_columns` (the FRI load path), instead of dropping deleted rows: - keep every doc slot so `doc_id`s stay aligned with the posting lists; - put `RowAddress::TOMBSTONE_ROW` in the deleted slots, and leave them out of the `inv` reverse map so a `row_id` lookup never resolves to a deleted doc; - keep `num_tokens` full-length, so `num_tokens(doc_id)` can't go out of bounds. In `Wand::search`, skip docs whose resolved `row_id` is `TOMBSTONE_ROW` — placed right beside the existing prefilter-mask skip and using the same iterator-advance, so a tombstoned doc is stepped over exactly like a prefilter-rejected one and never surfaces in results. The heavyweight physical remap (`DocSet::remap`) still does the real renumber + compact (and rebuilds the posting lists to match); this load-time path only needs to stay consistent until then. ### Note on stats Tombstoned slots are still counted in `total_tokens` / `len()`, so BM25 `avgdl` in the FRI window is effectively the pre-deletion average. This only perturbs *scores* slightly, never the result set, and the physical remap restores exact stats. Excluding tombstones would require changing `len()` semantics (used by `idf`), which isn't worth it for a transient window. ## Test `test_read_inverted_index_with_defer_index_remap_and_deletions`: delete a prefix, deferred-compact, then assert FTS returns exactly the surviving rows — both in the FRI window and after physical remap + trim. Without the fix it panics on the out-of-bounds `num_tokens` access. ## Scope Independent change against `main` — touches only the inverted-index load and query paths (`scalar/inverted/{index,wand}.rs`) plus one new test. 
The analogous IVF_HNSW desync under deletions (#3993) is **not** addressed here: the HNSW graph traverses node ids positionally to compute distances *during* search (not just at result collection), so it needs a different approach across the SQ/PQ/flat/RQ storage load paths — a separate change. --- rust/lance-index/src/scalar/inverted/index.rs | 40 ++++-- rust/lance-index/src/scalar/inverted/wand.rs | 9 ++ rust/lance/src/dataset/optimize.rs | 127 ++++++++++++++++++ 3 files changed, 163 insertions(+), 13 deletions(-) diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index 8e662f5db6f..1082e1dc371 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -42,6 +42,7 @@ use lance_arrow::{RecordBatchExt, iter_str_array}; use lance_core::cache::{CacheCodec, CacheKey, LanceCache, WeakLanceCache}; use lance_core::deepsize::DeepSizeOf; use lance_core::error::{DataFusionResult, LanceOptionExt}; +use lance_core::utils::address::RowAddress; use lance_core::utils::tokio::{get_num_compute_intensive_cpus, spawn_cpu}; use lance_core::utils::tracing::{IO_TYPE_LOAD_SCALAR_PART, TRACE_IO_EVENTS}; use lance_core::{Error, ROW_ID, ROW_ID_FIELD, Result}; @@ -4749,23 +4750,36 @@ impl DocSet { }); } - // if frag reuse happened, we'll need to remap the row_ids. And after row_ids been - // remapped, we'll need resort to make sure binary_search works. + // If frag reuse happened, remap the row_ids through it. Crucially we + // must NOT drop the rows the reuse index deleted, because the posting + // lists reference doc_ids *positionally* (a doc_id is an index into + // these arrays, fixed at build time). Dropping deleted rows would + // renumber every later doc_id and desync the posting lists, so wand + // would index `num_tokens`/`row_ids` out of bounds or score the wrong + // doc. 
Instead we tombstone deleted rows in place: their slot survives + // (so doc_ids stay aligned with the posting lists) carrying + // `RowAddress::TOMBSTONE_ROW`, which wand skips, and they are left out + // of `inv` so a row_id lookup never resolves to a deleted doc. The + // heavyweight physical remap (`DocSet::remap`) is what actually + // renumbers and compacts; this load-time path only has to stay + // consistent until then. if let Some(frag_reuse_index_ref) = frag_reuse_index.as_ref() { let mut row_ids = Vec::with_capacity(row_id_col.len()); - let mut num_tokens = Vec::with_capacity(num_tokens_col.len()); - for (row_id, num_token) in row_id_col.values().iter().zip(num_tokens_col.values()) { - if let Some(new_row_id) = frag_reuse_index_ref.remap_row_id(*row_id) { - row_ids.push(new_row_id); - num_tokens.push(*num_token); + let num_tokens = num_tokens_col.values().to_vec(); + let mut inv = Vec::with_capacity(row_id_col.len()); + for (doc_id, row_id) in row_id_col.values().iter().enumerate() { + match frag_reuse_index_ref.remap_row_id(*row_id) { + Some(new_row_id) => { + row_ids.push(new_row_id); + inv.push((new_row_id, doc_id as u32)); + } + None => { + // Deleted: keep the slot (doc_ids must not shift) but + // tombstone it and leave it out of `inv`. + row_ids.push(RowAddress::TOMBSTONE_ROW); + } } } - - let mut inv: Vec<(u64, u32)> = row_ids - .iter() - .enumerate() - .map(|(doc_id, row_id)| (*row_id, doc_id as u32)) - .collect(); inv.sort_unstable_by_key(|entry| entry.0); let total_tokens = num_tokens.iter().map(|&x| x as u64).sum(); diff --git a/rust/lance-index/src/scalar/inverted/wand.rs b/rust/lance-index/src/scalar/inverted/wand.rs index 609ec08041f..259de6ee06f 100644 --- a/rust/lance-index/src/scalar/inverted/wand.rs +++ b/rust/lance-index/src/scalar/inverted/wand.rs @@ -736,6 +736,15 @@ impl<'a, S: Scorer> Wand<'a, S> { } DocInfo::Located(doc) => doc.row_id, }; + // Skip docs the fragment-reuse remap deleted. 
They are tombstoned + // in the DocSet (slot kept so posting-list doc_ids stay aligned) + // and must not surface in results. + if docs_has_row_ids && row_id == RowAddress::TOMBSTONE_ROW { + if self.operator == Operator::Or { + self.push_back_leads(doc.doc_id() + 1); + } + continue; + } if docs_has_row_ids && !mask.selected(row_id) { if self.operator == Operator::Or { self.push_back_leads(doc.doc_id() + 1); diff --git a/rust/lance/src/dataset/optimize.rs b/rust/lance/src/dataset/optimize.rs index 274fde9955e..87dda8e7e57 100644 --- a/rust/lance/src/dataset/optimize.rs +++ b/rust/lance/src/dataset/optimize.rs @@ -4307,6 +4307,133 @@ mod tests { assert_eq!(scanner.count_rows().await.unwrap(), count3); } + /// Deferred compaction that materializes deletions must not corrupt an + /// inverted (FTS) index read through the fragment-reuse index. The index's + /// posting lists reference doc_ids positionally; if the load-time remap + /// dropped the deleted rows it would renumber the doc_ids and desync the + /// posting lists (out-of-bounds `num_tokens`, wrong/stale row ids). The + /// tombstone-preserve-positions load path must keep results correct in the + /// FRI window and after the physical remap + trim. + #[tokio::test] + async fn test_read_inverted_index_with_defer_index_remap_and_deletions() { + // Enough surviving docs for several compressed posting-list blocks + // (BLOCK_SIZE = 128), split across several fragments so compaction has + // real work — but no larger. + const ROWS: i32 = 1200; + const DELETED: i32 = 400; + + // Every row contains "lance", so the term matches all live rows; `id` + // tells us exactly which rows survive. 
+ let ids = Int32Array::from_iter_values(0..ROWS); + let docs = LargeStringArray::from_iter_values((0..ROWS).map(|_| "lance apple orange")); + let batch = RecordBatch::try_new( + Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("doc", DataType::LargeUtf8, false), + ]) + .into(), + vec![Arc::new(ids) as ArrayRef, Arc::new(docs) as ArrayRef], + ) + .unwrap(); + let schema_ref = batch.schema(); + let stream = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema_ref); + let mut dataset = Dataset::write( + stream, + "memory://test/table", + Some(WriteParams { + max_rows_per_file: 200, // 6 fragments + ..Default::default() + }), + ) + .await + .unwrap(); + + dataset + .create_index( + &["doc"], + IndexType::Inverted, + Some("doc_idx".into()), + &InvertedIndexParams::default(), + false, + ) + .await + .unwrap(); + + // Delete a prefix, then deferred-compact so the deletions are + // materialized into the fragment-reuse index the index is read through. + dataset.delete(&format!("id < {DELETED}")).await.unwrap(); + compact_files( + &mut dataset, + CompactionOptions { + target_rows_per_fragment: 2_000, + defer_index_remap: true, + ..Default::default() + }, + None, + ) + .await + .unwrap(); + assert!( + dataset + .load_index_by_name(FRAG_REUSE_INDEX_NAME) + .await + .unwrap() + .is_some(), + "deferred compaction must leave a fragment-reuse index" + ); + + // FTS "lance" → sorted surviving ids. Projecting `id` forces a take, so + // a stale row address would error or return a wrong/dead row. 
+ async fn search_ids(dataset: &Dataset) -> Vec<i32> { + let mut scanner = dataset.scan(); + scanner + .full_text_search(FullTextSearchQuery::new("lance".to_owned())) + .unwrap(); + scanner.project::<&str>(&["id"]).unwrap(); + let batches = scanner + .try_into_stream() + .await + .unwrap() + .try_collect::<Vec<_>>() + .await + .unwrap(); + let mut ids: Vec<i32> = batches + .iter() + .flat_map(|b| { + b.column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::<Int32Array>() + .unwrap() + .values() + .to_vec() + }) + .collect(); + ids.sort_unstable(); + ids + } + + let expected = (DELETED..ROWS).collect::<Vec<i32>>(); + + // FRI window: index read through the reuse index. + let during = search_ids(&dataset).await; + assert_eq!( + during, expected, + "FRI-window FTS must return exactly the surviving rows (no resurrection, no loss, no stale rows)" + ); + + // Physical remap + trim: must still be correct. + remapping::remap_column_index(&mut dataset, &["doc"], Some("doc_idx".into())) + .await + .unwrap(); + cleanup_frag_reuse_index(&mut dataset).await.unwrap(); + let after = search_ids(&dataset).await; + assert_eq!( + after, expected, + "FTS must stay correct after physical remap + fragment-reuse trim" + ); + } + #[tokio::test] async fn test_read_ngram_index_with_defer_index_remap() { // Generate random words using lance-datagen From 113f5134d037c377313e20b60a2cfa451332cf5d Mon Sep 17 00:00:00 2001 From: EJ Song <51077614+sezruby@users.noreply.github.com> Date: Wed, 17 Jun 2026 11:47:29 -0700 Subject: [PATCH 129/177] test(python): use non-deprecated params in search benchmark fixtures (#7285) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem The dataset fixtures in `python/benchmarks/test_search.py` pass deprecated parameters to the dataset APIs: - `lance.write_dataset(..., use_legacy_format=False)` - `lance.dataset(..., index_cache_size=64 * 1024)` The test config sets `filterwarnings = ['error::DeprecationWarning', ...]`, so these emit
`DeprecationWarning` **as errors during fixture setup**. As a result every benchmark in `test_search.py` errors out before running: ``` DeprecationWarning: use_legacy_format is deprecated, use data_storage_version instead DeprecationWarning: The 'index_cache_size' parameter is deprecated. Use 'index_cache_size_bytes' instead. ``` ## Fix Switch to the current parameters: - `use_legacy_format=False` → `data_storage_version="stable"` (the exact mapping the deprecation shim applies). - `index_cache_size=64 * 1024` → `index_cache_size_bytes=512 * 1024 * 1024` (512 MiB comfortably caches these 100k-row IVF_PQ indices). ## Verification ``` $ uv run --group benchmarks pytest python/benchmarks/test_search.py::test_ann_no_refine --benchmark-only test_ann_no_refine[clean] 541.48 us 1813 ops test_ann_no_refine[with_delete_files] 770.46 us 1285 ops test_ann_no_refine[with_new_rows] 2843.17 us 349 ops 3 passed ``` `ruff check` / `ruff format --check` clean. Co-authored-by: Claude Opus 4.8 (1M context) --- python/python/benchmarks/test_search.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/python/python/benchmarks/test_search.py b/python/python/benchmarks/test_search.py index 61076e61687..b4e33338cb1 100644 --- a/python/python/benchmarks/test_search.py +++ b/python/python/benchmarks/test_search.py @@ -78,10 +78,12 @@ def create_base_dataset(data_dir: Path) -> lance.LanceDataset: rows_remaining -= next_batch_length table = create_table(next_batch_length, offset) if offset == 0: - dataset = lance.write_dataset(table, tmp_path, use_legacy_format=False) + dataset = lance.write_dataset( + table, tmp_path, data_storage_version="stable" + ) else: dataset = lance.write_dataset( - table, tmp_path, mode="append", use_legacy_format=False + table, tmp_path, mode="append", data_storage_version="stable" ) offset += next_batch_length @@ -98,7 +100,7 @@ def create_base_dataset(data_dir: Path) -> lance.LanceDataset: dataset.create_scalar_index("category", 
"BITMAP") dataset.create_scalar_index("genres", "LABEL_LIST") - return lance.dataset(tmp_path, index_cache_size=64 * 1024) + return lance.dataset(tmp_path, index_cache_size_bytes=512 * 1024 * 1024) def create_delete_dataset(data_dir): @@ -113,7 +115,7 @@ def create_delete_dataset(data_dir): dataset = lance.dataset(tmp_path) dataset.delete("filterable % 2 != 0") - return lance.dataset(tmp_path, index_cache_size=64 * 1024) + return lance.dataset(tmp_path, index_cache_size_bytes=512 * 1024 * 1024) def create_new_rows_dataset(data_dir): @@ -129,7 +131,7 @@ def create_new_rows_dataset(data_dir): table = create_table(NEW_ROWS, offset=NUM_ROWS) dataset = lance.write_dataset(table, tmp_path, mode="append") - return lance.dataset(tmp_path, index_cache_size=64 * 1024) + return lance.dataset(tmp_path, index_cache_size_bytes=512 * 1024 * 1024) class Datasets(NamedTuple): From 34b54741715e3873dee3cff12ccee11e269f91de Mon Sep 17 00:00:00 2001 From: Dan Rammer Date: Wed, 17 Jun 2026 13:48:23 -0500 Subject: [PATCH 130/177] fix: dedup active-memtable predicate-crossing stale reads (vector + FTS) (#7067) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What Fixes a stale-read phantom shared by the **vector** and **FTS** index search arms over the MemWAL **active memtable**, and routes the in-memory newest-per-PK / membership decisions through a single maintained MVCC index. ## The bug The active memtable is an append log; a PK update is a later append with the same key. The in-memory secondary indexes — HNSW (vector) and the inverted index (FTS) — are **append-only**, so an updated row's old entries stay live. Both arms deduped with `WithinSourceDedupExec`, which only suppresses a stale row when the fresh version is **also in the result set**. When an update moves a row out of the query's match set (vector: far from the query; FTS: new text no longer matches), the fresh version isn't returned, so the stale version leaks. 
(`point_lookup` was immune — it already did the MVCC recency seek.) ## The fix Maintain a per-memtable **MVCC PK-position index**: a lock-free arena skiplist keyed on `(compute_pk_hash(pk_columns), row_position)`, enabled on the active memtable and carried through freeze. The row position *is* the version stamp, so this reuses the exact primitive point-lookup trusts (`get_newest_visible`). - **`NewestPkFilterExec`** keeps an index hit iff `get_newest_visible(pk_hash, max_visible) == row_position` — predicate-independent, snapshot-exact (keys on the scanner's latched `max_visible`). Wired into the active vector arm (replacing `WithinSourceDedupExec`) and the FTS arm (adding `with_row_id`). - **point_lookup** falls back to the index (hash + value-equality collision guard) when no scalar BTree exists; its plan-path active arm uses `SortExec(_rowid DESC).fetch(1)` instead of `WithinSourceDedupExec`. - **Cross-source block-list** probes the index per candidate (`GenMembership::Index`, snapshot-bounded) with no per-query set; flushed/base keep cached sets. `contains_pks` probes too. - **Cleanup:** `WithinSourceDedupExec` / `DedupDirection` and the per-query PK-hash set builders (`pk_hashes()`, `in_memory_pk_hashes`) are deleted. Net negative LOC. Hash keying covers single **and composite** PKs uniformly. The snapshot-bounded probe also closes a latent over-block where a not-yet-visible newer write could shadow an older visible copy. ## Tests Both `#[ignore]`d repros un-ignored and passing; new `PkPositionIndex` unit tests, point-lookup-without-btree, index-sourced block-list, and snapshot-bounded **vanished-row guard** tests (within- and cross-source). Full `mem_wal` suite green; `cargo fmt` + `clippy -D warnings` clean. ## Deferred follow-ups - Migrate `MemTableDedupScanExec`'s reverse-walk `HashSet` (filtered-read scan path) onto the same probe — the last within-source mechanism off the index; benchmark-gated. 
- In-graph HNSW within-gen eviction (perf end-game; correctness is now exact). 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.8 (1M context) --- java/lance-jni/src/mem_wal.rs | 37 + .../java/org/lance/memwal/MemWalTest.java | 29 +- python/python/tests/test_mem_wal.py | 7 + python/src/lib.rs | 1 + python/src/mem_wal.rs | 25 + rust/lance-index/src/scalar/btree.rs | 140 ++++ rust/lance-index/src/scalar/btree/flat.rs | 46 +- rust/lance/src/dataset/mem_wal/index.rs | 294 ++++++- .../lance/src/dataset/mem_wal/index/pk_key.rs | 204 +++++ .../dataset/mem_wal/memtable/batch_store.rs | 47 ++ .../src/dataset/mem_wal/memtable/flush.rs | 249 ++++++ .../mem_wal/memtable/scanner/builder.rs | 8 + rust/lance/src/dataset/mem_wal/scanner.rs | 1 + .../src/dataset/mem_wal/scanner/block_list.rs | 720 ++++++++++-------- .../src/dataset/mem_wal/scanner/builder.rs | 37 +- .../lance/src/dataset/mem_wal/scanner/exec.rs | 12 +- .../mem_wal/scanner/exec/newest_pk_filter.rs | 393 ++++++++++ .../src/dataset/mem_wal/scanner/exec/pk.rs | 2 +- .../mem_wal/scanner/exec/pk_block_filter.rs | 373 +++++++++ .../mem_wal/scanner/exec/pk_hash_filter.rs | 350 --------- .../scanner/exec/within_source_dedup.rs | 432 ----------- .../dataset/mem_wal/scanner/flushed_cache.rs | 59 +- .../src/dataset/mem_wal/scanner/fts_search.rs | 148 +++- .../src/dataset/mem_wal/scanner/planner.rs | 101 ++- .../dataset/mem_wal/scanner/point_lookup.rs | 172 ++++- .../dataset/mem_wal/scanner/vector_search.rs | 238 ++++-- rust/lance/src/dataset/mem_wal/util.rs | 10 + rust/lance/src/dataset/mem_wal/write.rs | 50 +- 28 files changed, 2860 insertions(+), 1325 deletions(-) create mode 100644 rust/lance/src/dataset/mem_wal/index/pk_key.rs create mode 100644 rust/lance/src/dataset/mem_wal/scanner/exec/newest_pk_filter.rs create mode 100644 rust/lance/src/dataset/mem_wal/scanner/exec/pk_block_filter.rs delete mode 100644 rust/lance/src/dataset/mem_wal/scanner/exec/pk_hash_filter.rs 
delete mode 100644 rust/lance/src/dataset/mem_wal/scanner/exec/within_source_dedup.rs diff --git a/java/lance-jni/src/mem_wal.rs b/java/lance-jni/src/mem_wal.rs index 9ba3fdd7440..20404b6a88b 100644 --- a/java/lance-jni/src/mem_wal.rs +++ b/java/lance-jni/src/mem_wal.rs @@ -27,6 +27,7 @@ use jni::sys::{jdouble, jint, jlong}; use lance::dataset::Dataset as LanceDataset; use lance::dataset::mem_wal::scanner::{ FlushedGeneration, LsmDataSourceCollector, LsmPointLookupPlanner, LsmVectorSearchPlanner, + write_pk_sidecar, }; use lance::dataset::mem_wal::write::{MemTableStats, WriteStatsSnapshot}; use lance::dataset::mem_wal::{ @@ -180,6 +181,42 @@ fn inner_put(env: &mut JNIEnv, this: JObject, stream_addr: jlong) -> Result<()> Ok(()) } +/// Test-support: write a primary-key dedup sidecar (`_pk_index/`) for a +/// flushed-generation dataset already staged at `gen_path`, mirroring what +/// production flush emits. Lets Java tests stage a *faithful* flushed +/// generation (dataset + sidecar); production always writes the sidecar during +/// flush, so a dataset-without-sidecar is not a state the system produces. +/// Mirrors the Python `_write_pk_sidecar` binding. 
+#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_memwal_MemWalTest_nativeWritePkSidecar( + mut env: JNIEnv, + _class: JClass, + gen_path: JString, + stream_addr: jlong, + pk_columns: JObject, +) { + ok_or_throw_without_return!( + env, + inner_write_pk_sidecar(&mut env, gen_path, stream_addr, pk_columns) + ); +} + +fn inner_write_pk_sidecar( + env: &mut JNIEnv, + gen_path: JString, + stream_addr: jlong, + pk_columns: JObject, +) -> Result<()> { + let gen_path: String = env.get_string(&gen_path)?.into(); + let pk_columns = env.get_strings(&pk_columns)?; + let stream_ptr = stream_addr as *mut FFI_ArrowArrayStream; + let reader = unsafe { ArrowArrayStreamReader::from_raw(stream_ptr) }?; + let batches: Vec = reader.collect::>()?; + let pk_refs: Vec<&str> = pk_columns.iter().map(String::as_str).collect(); + RT.block_on(write_pk_sidecar(&gen_path, &batches, &pk_refs))?; + Ok(()) +} + #[unsafe(no_mangle)] pub extern "system" fn Java_org_lance_memwal_ShardWriter_nativeStats<'local>( mut env: JNIEnv<'local>, diff --git a/java/src/test/java/org/lance/memwal/MemWalTest.java b/java/src/test/java/org/lance/memwal/MemWalTest.java index ee26932dd59..5af3bd3f474 100644 --- a/java/src/test/java/org/lance/memwal/MemWalTest.java +++ b/java/src/test/java/org/lance/memwal/MemWalTest.java @@ -50,6 +50,7 @@ import java.util.Arrays; import java.util.Collections; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Optional; import java.util.UUID; @@ -142,6 +143,30 @@ private static Dataset writeAppendOnlyDataset( } } + /** + * Stage a faithful flushed generation at {@code genPath}: the Lance dataset plus its + * primary-key dedup sidecar ({@code _pk_index/}), mirroring what production flush emits. The LSM + * scanner's cross-generation block-list opens the sidecar, so a dataset alone (no sidecar) is not + * a state production produces. Mirrors the Python {@code _write_flushed_gen} test helper. 
+ */ + private static void writeFlushedGen( + BufferAllocator allocator, String genPath, long[] ids, String prefix) throws Exception { + writeLookupDataset(allocator, genPath, ids, prefix).close(); + try (VectorSchemaRoot root = lookupRoot(allocator, ids, prefix); + ArrowReader reader = toReader(allocator, root); + ArrowArrayStream stream = ArrowArrayStream.allocateNew(allocator)) { + Data.exportArrayStream(allocator, reader, stream); + nativeWritePkSidecar(genPath, stream.memoryAddress(), Collections.singletonList("id")); + } + } + + /** + * Test-support native: write the primary-key dedup sidecar for a flushed-generation dataset + * already staged at {@code genPath}. See {@link #writeFlushedGen}. + */ + private static native void nativeWritePkSidecar( + String genPath, long streamAddress, List pkColumns); + /** Read an LSM scanner fully into an {@code id -> name} map. */ private static Map readByName(ArrowReader reader) throws Exception { Map byId = new HashMap<>(); @@ -367,7 +392,7 @@ void testLsmScannerFromSnapshots(@TempDir Path tempDir) throws Exception { // Flushed generation overwrites id=2. 
String genPath = basePath + "/_mem_wal/" + shardId + "/gen_1"; - writeLookupDataset(allocator, genPath, new long[] {2}, "gen1").close(); + writeFlushedGen(allocator, genPath, new long[] {2}, "gen1"); ShardSnapshot snapshot = new ShardSnapshot(shardId).withFlushedGeneration(1, "gen_1").withCurrentGeneration(2); @@ -393,7 +418,7 @@ void testPointLookup(@TempDir Path tempDir) throws Exception { dataset.initializeMemWal(new InitializeMemWalParams()); String genPath = basePath + "/_mem_wal/" + shardId + "/gen_1"; - writeLookupDataset(allocator, genPath, new long[] {2}, "gen1").close(); + writeFlushedGen(allocator, genPath, new long[] {2}, "gen1"); ShardSnapshot snapshot = new ShardSnapshot(shardId).withFlushedGeneration(1, "gen_1").withCurrentGeneration(2); diff --git a/python/python/tests/test_mem_wal.py b/python/python/tests/test_mem_wal.py index b8c859cb637..c21e88b2416 100644 --- a/python/python/tests/test_mem_wal.py +++ b/python/python/tests/test_mem_wal.py @@ -60,9 +60,16 @@ def _write_flushed_gen(base_path: str, shard_id: str, gen_folder: str, data: pa. The collector resolves flushed generation paths as: {base_dataset_path}/_mem_wal/{shard_id}/{gen_folder} + + Production flush also writes a primary-key dedup sidecar (`_pk_index/`) that + the LSM scanner opens to dedup across generations; stage it here too so the + flushed generation faithfully matches what flush produces. 
""" + from lance.lance import _write_pk_sidecar + gen_path = os.path.join(base_path, "_mem_wal", shard_id, gen_folder) lance.write_dataset(data, gen_path, schema=_LOOKUP_SCHEMA) + _write_pk_sidecar(gen_path, data, ["id"]) def test_point_lookup_with_memtables(tmp_path): diff --git a/python/src/lib.rs b/python/src/lib.rs index cf29b26c46a..3bf4eab221e 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -293,6 +293,7 @@ fn lance(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_wrapped(wrap_pyfunction!(mem_wal::py_evaluate_sharding_spec))?; + m.add_wrapped(wrap_pyfunction!(mem_wal::py_write_pk_sidecar))?; m.add_wrapped(wrap_pyfunction!(bfloat16_array))?; m.add_wrapped(wrap_pyfunction!(write_dataset))?; m.add_wrapped(wrap_pyfunction!(write_fragments))?; diff --git a/python/src/mem_wal.rs b/python/src/mem_wal.rs index 25127c95ea4..dc9718c0dce 100644 --- a/python/src/mem_wal.rs +++ b/python/src/mem_wal.rs @@ -51,6 +51,31 @@ pub fn py_evaluate_sharding_spec<'py>( result.to_pyarrow(py) } +/// Write a primary-key dedup sidecar (`_pk_index/`) for a flushed-generation +/// dataset already written at `gen_path`, mirroring what production flush emits. +/// +/// Test-support only: lets Python tests stage a *faithful* flushed generation +/// (dataset + sidecar). Production always writes the sidecar during flush, so a +/// dataset-without-sidecar is not a state the system otherwise produces. 
+#[pyfunction(name = "_write_pk_sidecar", signature = (gen_path, data, pk_columns))] +pub fn py_write_pk_sidecar( + py: Python<'_>, + gen_path: String, + data: &Bound<'_, PyAny>, + pk_columns: Vec, +) -> PyResult<()> { + let reader = ArrowArrayStreamReader::from_pyarrow_bound(data) + .map_err(|e| PyValueError::new_err(format!("Cannot read data as Arrow: {}", e)))?; + let batches: Vec = reader + .collect::>() + .map_err(|e| PyIOError::new_err(format!("Failed to read batches: {}", e)))?; + rt().block_on(Some(py), async move { + let pk_refs: Vec<&str> = pk_columns.iter().map(String::as_str).collect(); + lance::dataset::mem_wal::scanner::write_pk_sidecar(&gen_path, &batches, &pk_refs).await + })? + .map_err(|e: lance::Error| PyIOError::new_err(e.to_string())) +} + fn sharding_spec_from_py(spec: &Bound<'_, PyAny>) -> PyResult { let spec_id = get_py_value(spec, "spec_id")?.extract::()?; let fields_obj = get_py_value(spec, "fields")?; diff --git a/rust/lance-index/src/scalar/btree.rs b/rust/lance-index/src/scalar/btree.rs index 9abd69022c0..c46702760cc 100644 --- a/rust/lance-index/src/scalar/btree.rs +++ b/rust/lance-index/src/scalar/btree.rs @@ -1550,6 +1550,66 @@ impl BTreeIndex { } } + /// For each key in `keys`, whether this index contains it — a batched + /// existence check returning a mask aligned to `keys`. + /// + /// The per-key sibling of `search(Equals(..))`, but one call replaces N + /// probes: keys are grouped by page using the same page resolution as + /// [`ScalarIndex::search`] (`pages_eq`), each touched page is loaded once + /// (session-cached), and membership is tested against the page's values via + /// `FlatIndex::contains_values`. Avoids the per-key `SearchResult` / + /// `RowAddrTreeMap` allocation when the caller only wants a yes/no. + /// + /// Intended for primary-key dedup, where keys are non-null; a null key maps + /// to `false`. 
+ pub async fn contains_keys( + &self, + keys: &[ScalarValue], + metrics: &dyn MetricsCollector, + ) -> Result<Vec<bool>> { + // Group each key (by input position) under every page whose value range + // could hold it. Mirrors `search`'s page selection so the two agree. + let mut by_page: HashMap<u32, Vec<(usize, OrderableScalarValue)>> = HashMap::new(); + for (idx, key) in keys.iter().enumerate() { + if key.is_null() { + continue; + } + let ov = OrderableScalarValue(key.clone()); + for matches in self.page_lookup.pages_eq(&ov)? { + by_page + .entry(matches.page_id()) + .or_default() + .push((idx, ov.clone())); + } + } + + let index_reader = LazyIndexReader::new(self.store.clone(), self.ranges_to_files.clone()); + let page_tasks = by_page.into_iter().map(|(page_number, entries)| { + let index_reader = index_reader.clone(); + async move { + let page = self.lookup_page(page_number, index_reader, metrics).await?; + let needles: Vec<OrderableScalarValue> = + entries.iter().map(|(_, ov)| ov.clone()).collect(); + let present = page.contains_values(&needles)?; + Result::Ok((entries, present)) + } + }); + + let mut result = vec![false; keys.len()]; + let page_results: Vec<_> = stream::iter(page_tasks) + .buffer_unordered(get_num_compute_intensive_cpus()) + .try_collect() + .await?; + for (entries, present) in page_results { + for (idx, ov) in entries { + if present.contains(&ov) { + result[idx] = true; + } + } + } + Ok(result) + } + async fn lookup_page( &self, page_number: u32, @@ -3429,6 +3489,86 @@ mod tests { } } + #[tokio::test] + async fn test_contains_keys_matches_search() { + let tmpdir = TempObjDir::default(); + let test_store = Arc::new(LanceIndexStore::new( + Arc::new(ObjectStore::local()), + tmpdir.clone(), + Arc::new(LanceCache::no_cache()), + )); + + // 1000 distinct Int32 values [0, 1000), spread across many small pages + // (batch_size 64) so the keys below exercise multi-page grouping.
+ let data = gen_batch() + .col("value", array::step::<Int32Type>()) + .col("_rowid", array::step::<UInt64Type>()) + .into_df_exec(RowCount::from(100), BatchCount::from(10)); + let schema = data.schema(); + let sort_expr = PhysicalSortExpr::new_default(col("value", schema.as_ref()).unwrap()); + let plan = Arc::new(SortExec::new([sort_expr].into(), data)); + let stream = plan.execute(0, Arc::new(TaskContext::default())).unwrap(); + let stream = break_stream(stream, 64); + let stream = stream.map_err(DataFusionError::from); + let stream = + Box::pin(RecordBatchStreamAdapter::new(schema, stream)) as SendableRecordBatchStream; + + train_btree_index(stream, test_store.as_ref(), 64, None, None) + .await + .unwrap(); + let index = BTreeIndex::load(test_store, None, &LanceCache::no_cache()) + .await + .unwrap(); + + // Present (range ends, mid, and adjacent values that straddle page + // boundaries), interleaved with absent (below/above range, and a gap). + let keys: Vec<i32> = vec![0, 999, 500, 1, 998, -1, 1000, 1500, 250, 251, 7, 64, 63, 65]; + let scalar_keys: Vec<ScalarValue> = + keys.iter().map(|k| ScalarValue::Int32(Some(*k))).collect(); + + let batched = index + .contains_keys(&scalar_keys, &NoOpMetricsCollector) + .await + .unwrap(); + + // Oracle: the per-key Equals search the batched path replaces. + let mut oracle = Vec::with_capacity(keys.len()); + for k in &scalar_keys { + let result = index + .search(&SargableQuery::Equals(k.clone()), &NoOpMetricsCollector) + .await + .unwrap(); + oracle.push(!result.row_addrs().is_empty()); + } + assert_eq!( + batched, oracle, + "contains_keys must agree with per-key Equals search; keys={keys:?}" + ); + + // And both must match ground truth: [0, 1000) present, others absent. + let expected: Vec<bool> = keys.iter().map(|k| (0..1000).contains(k)).collect(); + assert_eq!(batched, expected); + + // Empty input → empty mask.
+ assert!( + index + .contains_keys(&[], &NoOpMetricsCollector) + .await + .unwrap() + .is_empty() + ); + + // A null key maps to false (and must not panic). + let with_null = vec![ScalarValue::Int32(Some(5)), ScalarValue::Int32(None)]; + assert_eq!( + index + .contains_keys(&with_null, &NoOpMetricsCollector) + .await + .unwrap(), + vec![true, false] + ); + } + #[tokio::test] async fn test_page_cache() { let tmpdir = TempObjDir::default(); diff --git a/rust/lance-index/src/scalar/btree/flat.rs b/rust/lance-index/src/scalar/btree/flat.rs index 045b4c95c55..7663d8478c1 100644 --- a/rust/lance-index/src/scalar/btree/flat.rs +++ b/rust/lance-index/src/scalar/btree/flat.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use std::collections::HashMap; +use std::collections::{BTreeSet, HashMap}; use std::{ops::Bound, sync::Arc}; use arrow_array::Array; @@ -21,8 +21,10 @@ use lance_select::{NullableRowAddrSet, RowAddrTreeMap, RowSetOps}; use roaring::RoaringBitmap; use tracing::instrument; +use datafusion_common::ScalarValue; + use crate::metrics::MetricsCollector; -use crate::scalar::btree::BTREE_VALUES_COLUMN; +use crate::scalar::btree::{BTREE_VALUES_COLUMN, OrderableScalarValue}; use crate::scalar::{AnyQuery, SargableQuery}; const VALUES_COL_IDX: usize = 0; @@ -82,6 +84,46 @@ impl FlatIndex { self.data.column(IDS_COL_IDX) } + fn values(&self) -> &ArrayRef { + self.data.column(VALUES_COL_IDX) + } + + /// Which of `needles` are present in this page. + /// + /// Batched existence sibling of [`Self::search`]: it runs the same `IsIn` + /// predicate over the page's `values` column, but returns the matched + /// *values* rather than row addresses — so the caller can map each result + /// back to the input key it asked about. The page scan stays vectorized; + /// only the (small) matched subset is lifted into `ScalarValue`. 
+ /// + /// Nulls: a null `values` entry never matches a (non-null) primary-key + /// needle, so it is simply absent from the result. + pub(crate) fn contains_values( + &self, + needles: &[OrderableScalarValue], + ) -> Result> { + if needles.is_empty() { + return Ok(BTreeSet::new()); + } + let query = SargableQuery::IsIn(needles.iter().map(|v| v.0.clone()).collect()); + let expr = query.to_expr(BTREE_VALUES_COLUMN.to_string()); + let expr = create_physical_expr(&expr, &self.df_schema, &ExecutionProps::default())?; + let predicate = expr.evaluate(&self.data)?; + let predicate = predicate.into_array(self.data.num_rows())?; + let predicate = predicate + .as_any() + .downcast_ref::() + .expect("Predicate should return boolean array"); + let matched = arrow_select::filter::filter(self.values(), predicate)?; + (0..matched.len()) + .map(|i| { + Ok(OrderableScalarValue(ScalarValue::try_from_array( + &matched, i, + )?)) + }) + .collect() + } + pub fn all(&self) -> NullableRowAddrSet { // Some rows will be in both sets but that is ok, null trumps true NullableRowAddrSet::new(self.all_addrs_map.clone(), self.null_addrs_map.clone()) diff --git a/rust/lance/src/dataset/mem_wal/index.rs b/rust/lance/src/dataset/mem_wal/index.rs index 116ea6c60ce..208971f7be6 100644 --- a/rust/lance/src/dataset/mem_wal/index.rs +++ b/rust/lance/src/dataset/mem_wal/index.rs @@ -18,10 +18,14 @@ mod arena_skiplist; mod btree; mod fts; mod hnsw; +mod pk_key; use std::collections::HashMap; +use std::sync::Arc; use std::sync::atomic::{AtomicUsize, Ordering}; +use datafusion::common::ScalarValue; + use super::memtable::batch_store::StoredBatch; use arrow_array::RecordBatch; use lance_core::datatypes::Schema as LanceSchema; @@ -44,6 +48,32 @@ pub type RowPosition = u64; pub use btree::{BTreeIndexConfig, BTreeMemIndex}; pub use fts::{FtsIndexConfig, FtsMemIndex, FtsQueryExpr, SearchOptions}; pub use hnsw::{HnswIndexConfig, HnswMemIndex}; +pub use pk_key::encode_pk_tuple; + +use pk_key::encode_pk_batch; + 
+/// Synthetic column the composite PK index is keyed on: the order-preserving +/// encoded tuple (see [`encode_pk_tuple`]), stored as `Binary` so a +/// [`BTreeMemIndex`]'s byte backend indexes it directly. +const PK_KEY_COLUMN: &str = "__pk_key__"; + +/// The memtable's primary-key index, used to answer "newest visible version of +/// this key" for dedup. Single-column PKs reuse the column's compact typed +/// [`BTreeMemIndex`] (no second copy); composite PKs key a `BTreeMemIndex` on +/// the order-preserving encoded tuple ([`encode_pk_tuple`]) instead. Either way +/// the lookup is a single seek on one `BTreeMemIndex`. +enum PkIndex { + /// Arity 1: aliases a `btree_indexes` entry, so the insert loop maintains it. + Single(Arc), + /// Arity >= 2: a `BTreeMemIndex` over the encoded-tuple `Binary` key, + /// maintained explicitly in the insert paths (the original batch lacks the + /// synthetic key column). `columns` are the PK columns in order, resolved + /// against each batch's schema at insert time. + Composite { + index: Arc, + columns: Vec, + }, +} // ============================================================================ // Index Store @@ -195,12 +225,17 @@ impl MemIndexConfig { /// therefore safe for scanners to read. Scanners snapshot this at plan /// construction time so every plan keys on a stable MVCC cursor. pub struct IndexStore { - /// BTree indexes keyed by index name. - btree_indexes: HashMap, + /// BTree indexes keyed by index name. `Arc` so the primary-key BTrees can be + /// shared into [`Self::pk_btrees`] without a second copy or a second insert. + btree_indexes: HashMap>, /// HNSW vector indexes keyed by index name. hnsw_indexes: HashMap, /// FTS indexes keyed by index name. fts_indexes: HashMap, + /// The primary-key index (single-column or composite), or `None` without a + /// primary key. Queried via [`Self::pk_newest_visible`] (see + /// [`Self::enable_pk_index`]). 
+ pk_index: Option, /// Maximum batch position that is durable in the WAL and therefore /// visible to scanners. Advanced unconditionally after a WAL append /// succeeds; not gated on whether any indexes are configured. @@ -213,6 +248,7 @@ impl Default for IndexStore { btree_indexes: HashMap::new(), hnsw_indexes: HashMap::new(), fts_indexes: HashMap::new(), + pk_index: None, max_visible_batch_position: AtomicUsize::new(0), } } @@ -230,6 +266,16 @@ impl std::fmt::Debug for IndexStore { &self.hnsw_indexes.keys().collect::>(), ) .field("fts_indexes", &self.fts_indexes.keys().collect::>()) + .field( + "pk_index", + &match &self.pk_index { + None => "none".to_string(), + Some(PkIndex::Single(b)) => format!("single({})", b.column_name()), + Some(PkIndex::Composite { columns, .. }) => { + format!("composite({})", columns.join(", ")) + } + }, + ) .field( "max_visible_batch_position", &self.max_visible_batch_position.load(Ordering::Acquire), @@ -264,7 +310,7 @@ impl IndexStore { for config in configs { match config { MemIndexConfig::BTree(c) => { - let index = BTreeMemIndex::new(c.field_id, c.column.clone()); + let index = Arc::new(BTreeMemIndex::new(c.field_id, c.column.clone())); registry.btree_indexes.insert(c.name.clone(), index); } MemIndexConfig::Hnsw(c) => { @@ -293,7 +339,7 @@ impl IndexStore { /// the production memtable path goes through [`Self::from_configs`]. pub fn add_btree(&mut self, name: String, field_id: i32, column: String) { self.btree_indexes - .insert(name, BTreeMemIndex::new(field_id, column)); + .insert(name, Arc::new(BTreeMemIndex::new(field_id, column))); } /// Add an HNSW vector index with default build parameters. @@ -362,6 +408,158 @@ impl IndexStore { .insert(name, FtsMemIndex::with_params(field_id, column, params)); } + /// Maintain a primary-key index so the memtable can answer "newest visible + /// version of this key" (see [`Self::pk_newest_visible`]). 
+ /// + /// Single-column PKs reuse an existing BTree on the field, else auto-create + /// one under a `__pk__*` name so the normal insert loop maintains it (no + /// second copy). Composite (arity >= 2) PKs key a `BTreeMemIndex` on the + /// order-preserving encoded tuple (synthetic `PK_KEY_COLUMN`), maintained + /// explicitly in the insert paths. Call once at construction, after + /// [`Self::from_configs`] and before any inserts; a no-op when `pk_columns` + /// is empty. + pub fn enable_pk_index(&mut self, pk_columns: &[(String, i32)]) { + self.pk_index = match pk_columns { + [] => None, + [(column, field_id)] => { + let btree = match self + .btree_indexes + .values() + .find(|b| b.field_id() == *field_id) + { + Some(existing) => existing.clone(), + None => { + let btree = Arc::new(BTreeMemIndex::new(*field_id, column.clone())); + self.btree_indexes + .insert(format!("__pk__{column}"), btree.clone()); + btree + } + }; + Some(PkIndex::Single(btree)) + } + multi => Some(PkIndex::Composite { + // Synthetic field id (-1): the composite index is held directly, + // never resolved by field id. + index: Arc::new(BTreeMemIndex::new(-1, PK_KEY_COLUMN.to_string())), + columns: multi.iter().map(|(c, _)| c.clone()).collect(), + }), + }; + } + + /// Whether the memtable has a primary-key index. + pub fn has_pk_index(&self) -> bool { + self.pk_index.is_some() + } + + /// Sorted `(value, row_id)` training batches for the flushed on-disk PK + /// BTree (the sidecar dedup index). Single-column emits the typed PK value; + /// composite emits the order-preserving `Binary` encoded tuple. Empty when + /// there is no primary key. Row positions line up 1:1 with the forward- + /// written data file, so they are the flushed row ids directly. + pub fn pk_training_batches(&self, batch_size: usize) -> Result> { + match &self.pk_index { + None => Ok(Vec::new()), + Some(PkIndex::Single(btree)) => btree.to_training_batches(batch_size), + Some(PkIndex::Composite { index, .. 
}) => index.to_training_batches(batch_size), + } + } + + /// Resolve the PK columns' positions in `batch` (composite insert helper). + fn pk_batch_indices(batch: &RecordBatch, columns: &[String]) -> Result> { + columns + .iter() + .map(|c| { + batch + .schema() + .column_with_name(c) + .map(|(i, _)| i) + .ok_or_else(|| { + Error::invalid_input(format!("PK column '{c}' not found in batch")) + }) + }) + .collect() + } + + /// Maintain the composite PK index for `batch` (no-op for single/no PK): + /// encode the PK columns into the synthetic `PK_KEY_COLUMN` `Binary` column + /// and feed that to the keyed `BTreeMemIndex`. + fn insert_composite_pk(&self, batch: &RecordBatch, row_offset: u64) -> Result<()> { + if let Some(PkIndex::Composite { index, columns }) = &self.pk_index { + let pk_indices = Self::pk_batch_indices(batch, columns)?; + let encoded = encode_pk_batch(batch, &pk_indices)?; + let schema = Arc::new(arrow_schema::Schema::new(vec![arrow_schema::Field::new( + PK_KEY_COLUMN, + arrow_schema::DataType::Binary, + false, + )])); + let key_batch = RecordBatch::try_new(schema, vec![Arc::new(encoded)]) + .map_err(|e| Error::invalid_input(e.to_string()))?; + index.insert(&key_batch, row_offset)?; + } + Ok(()) + } + + /// The newest row position of the primary-key tuple `values` (in PK order) + /// visible at `max_visible_row`, or `None`. A single seek either way: + /// single-column probes the typed BTree; composite probes the encoded-tuple + /// index. Collision-free, since `position` is the row identity. + pub fn pk_newest_visible( + &self, + values: &[ScalarValue], + max_visible_row: RowPosition, + ) -> Option { + match &self.pk_index { + None => None, + Some(PkIndex::Single(btree)) => btree.get_newest_visible(&values[0], max_visible_row), + Some(PkIndex::Composite { index, .. }) => { + // An unsupported PK type would have failed at insert, so the + // index can't hold a tuple this fails to encode. 
The probe key is + // the same `Binary`-encoded tuple the insert path indexed. + let key = encode_pk_tuple(values).ok()?; + index.get_newest_visible(&ScalarValue::Binary(Some(key)), max_visible_row) + } + } + } + + /// Whether `position` is the newest visible row of `values` — the recency + /// check the active index-search arms apply to drop predicate-crossing + /// stale hits. Callers gate on [`Self::has_pk_index`] first, since this is + /// `false` (drop) when the memtable has no primary-key index. + pub fn pk_is_newest( + &self, + values: &[ScalarValue], + position: RowPosition, + max_visible_row: RowPosition, + ) -> bool { + self.pk_newest_visible(values, max_visible_row) == Some(position) + } + + /// Whether `key` has any version visible at `max_visible_row` — the + /// cross-source block-list's existence query, snapshot-bounded so a + /// not-yet-visible write can't shadow an older visible copy. + /// + /// `key` is already in the index's key space: the typed PK value for a + /// single-column key, the `Binary`-encoded tuple for a composite one (built + /// by `block_list::on_disk_pk_key`, the same key the flushed on-disk index is + /// probed with). Both arities forward it straight to the keyed BTree. + pub fn pk_contains_key(&self, key: &ScalarValue, max_visible_row: RowPosition) -> bool { + match &self.pk_index { + None => false, + Some(PkIndex::Single(btree)) | Some(PkIndex::Composite { index: btree, .. }) => { + btree.get_newest_visible(key, max_visible_row).is_some() + } + } + } + + /// Whether the primary-key index holds no rows (or doesn't exist). + pub fn pk_is_empty(&self) -> bool { + match &self.pk_index { + None => true, + Some(PkIndex::Single(btree)) => btree.is_empty(), + Some(PkIndex::Composite { index, .. }) => index.is_empty(), + } + } + /// Insert a batch into all indexes. 
pub fn insert(&self, batch: &RecordBatch, row_offset: u64) -> Result<()> { self.insert_with_batch_position(batch, row_offset, None) @@ -384,6 +582,9 @@ impl IndexStore { for index in self.fts_indexes.values() { index.insert(batch, row_offset)?; } + // Single-column PK aliases a `btree_indexes` entry (maintained above); + // a composite PK has its own index, maintained here. + self.insert_composite_pk(batch, row_offset)?; // Update global watermark after all indexes have been updated if let Some(bp) = batch_position { @@ -440,6 +641,12 @@ impl IndexStore { } } + // Single-column PK aliases a `btree_indexes` entry (maintained above); + // a composite PK has its own index, maintained here. + for stored in batches { + self.insert_composite_pk(&stored.data, stored.row_offset)?; + } + // Update global watermark to the max batch position let max_bp = batches.iter().map(|b| b.batch_position).max().unwrap(); self.advance_max_visible_batch_position(max_bp); @@ -552,6 +759,14 @@ impl IndexStore { .map(|(name, _idx_type, duration)| (name.to_string(), duration)) .collect(); + // Single-column PK aliases a `btree_indexes` entry — its thread above + // already maintained it (and joined). A composite PK has its own + // index; maintain it here before the watermark advances so the + // visible prefix is fully indexed. + for stored in batches { + self.insert_composite_pk(&stored.data, stored.row_offset)?; + } + // Update global watermark to the max batch position let max_bp = batches.iter().map(|b| b.batch_position).max().unwrap(); self.advance_max_visible_batch_position(max_bp); @@ -562,7 +777,7 @@ impl IndexStore { /// Get a BTree index by name. pub fn get_btree(&self, name: &str) -> Option<&BTreeMemIndex> { - self.btree_indexes.get(name) + self.btree_indexes.get(name).map(Arc::as_ref) } /// Get an HNSW vector index by name. 
@@ -583,6 +798,7 @@ impl IndexStore { self.btree_indexes .values() .find(|idx| idx.field_id() == field_id) + .map(Arc::as_ref) } /// Get an HNSW vector index by field ID. @@ -607,6 +823,7 @@ impl IndexStore { self.btree_indexes .values() .find(|idx| idx.column_name() == column) + .map(Arc::as_ref) } /// Get an HNSW vector index by column name. @@ -694,6 +911,73 @@ mod tests { .unwrap() } + /// Single-column `id` batch for primary-key lookup tests. + fn id_batch(ids: &[i32]) -> RecordBatch { + RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![Field::new( + "id", + DataType::Int32, + false, + )])), + vec![Arc::new(Int32Array::from(ids.to_vec()))], + ) + .unwrap() + } + + #[test] + fn pk_newest_visible_single_column() { + let mut store = IndexStore::new(); + store.enable_pk_index(&[("id".to_string(), 0)]); + // id=1 at positions 0 and 2 (an update), id=2 at position 1. + store.insert(&id_batch(&[1, 2]), 0).unwrap(); + store.insert(&id_batch(&[1]), 2).unwrap(); + + let one = [ScalarValue::Int32(Some(1))]; + // Watermark above the update sees the newest position; below it, the older. + assert_eq!(store.pk_newest_visible(&one, 5), Some(2)); + assert_eq!(store.pk_newest_visible(&one, 1), Some(0)); + assert!(store.pk_is_newest(&one, 2, 5)); + assert!(!store.pk_is_newest(&one, 0, 5)); + // Absent key (probed by the typed value, as the block-list does). + assert!(!store.pk_contains_key(&ScalarValue::Int32(Some(9)), 5)); + } + + #[test] + fn pk_newest_visible_composite_seeks_encoded_tuple() { + let mut store = IndexStore::new(); + store.enable_pk_index(&[("id".to_string(), 0), ("name".to_string(), 1)]); + // Rows: (1,"a")@0, (1,"b")@1, (1,"a")@2 — an update of (1,"a"). 
+ let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, false), + ])); + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(Int32Array::from(vec![1, 1, 1])), + Arc::new(StringArray::from(vec!["a", "b", "a"])), + ], + ) + .unwrap(); + store.insert(&batch, 0).unwrap(); + + let tuple_1a = [ScalarValue::Int32(Some(1)), ScalarValue::from("a")]; + let tuple_1b = [ScalarValue::Int32(Some(1)), ScalarValue::from("b")]; + // (1,"a")'s newest visible row is its re-write at position 2. + assert_eq!(store.pk_newest_visible(&tuple_1a, 5), Some(2)); + assert!(store.pk_is_newest(&tuple_1a, 2, 5)); + assert!(!store.pk_is_newest(&tuple_1a, 0, 5)); + // (1,"b") only exists at position 1. + assert_eq!(store.pk_newest_visible(&tuple_1b, 5), Some(1)); + // Watermark below the re-write: the older (1,"a")@0 is the newest visible. + assert_eq!(store.pk_newest_visible(&tuple_1a, 1), Some(0)); + // An absent tuple (probed by its Binary-encoded key, as the block-list + // does). + let tuple_2a = [ScalarValue::Int32(Some(2)), ScalarValue::from("a")]; + let key_2a = ScalarValue::Binary(Some(encode_pk_tuple(&tuple_2a).unwrap())); + assert!(!store.pk_contains_key(&key_2a, 5)); + } + #[test] fn test_index_registry() { let schema = create_test_schema(); diff --git a/rust/lance/src/dataset/mem_wal/index/pk_key.rs b/rust/lance/src/dataset/mem_wal/index/pk_key.rs new file mode 100644 index 00000000000..b31fe42c995 --- /dev/null +++ b/rust/lance/src/dataset/mem_wal/index/pk_key.rs @@ -0,0 +1,204 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Composite primary-key encoding for MemWAL dedup. +//! +//! A multi-column primary key is reduced to a single order-preserving byte +//! string ([`encode_pk_tuple`]) so the whole tuple is one comparable key: +//! lexicographic byte order equals tuple order, and distinct tuples never +//! collide. 
Encoded as a `Binary` value, the tuple is indexed directly by a +//! [`super::BTreeMemIndex`] (its byte backend) — both in memory and, after +//! flush, as the on-disk BTree's `Binary` value column — so a probe builds +//! `ScalarValue::Binary(key)` and every layer agrees. +//! +//! Single-column primary keys do **not** use this — they key the typed +//! `BTreeMemIndex` on the column value directly. + +use arrow_array::{BinaryArray, RecordBatch}; +use datafusion::common::ScalarValue; +use lance_core::{Error, Result}; + +/// Sign-flip a signed integer to an order-preserving unsigned key (matches the +/// fixed-int BTree backend). Big-endian bytes of the result sort like the value. +#[inline] +fn encode_signed(v: i64) -> u64 { + (v as u64) ^ (1u64 << 63) +} + +/// Append an order-preserving encoding of one non-null byte string: each `0x00` +/// is escaped to `0x00 0xFF`, then a `0x00 0x00` terminator is appended. The +/// terminator sorts before any escaped content, so a prefix orders before its +/// extensions and no value can forge a column boundary. +fn encode_bytes(out: &mut Vec, bytes: &[u8]) { + for &b in bytes { + out.push(b); + if b == 0x00 { + out.push(0xFF); + } + } + out.extend_from_slice(&[0x00, 0x00]); +} + +/// Append the order-preserving encoding of a single PK column value. A leading +/// tag (`0x00` null / `0x01` non-null) makes nulls sort first and keeps the +/// per-column encoding self-delimiting (fixed-width for ints, terminated for +/// bytes), so concatenating columns stays injective and order-preserving. +fn encode_value(out: &mut Vec, value: &ScalarValue) -> Result<()> { + if value.is_null() { + out.push(0x00); + return Ok(()); + } + out.push(0x01); + macro_rules! 
be_signed { + ($v:expr) => { + out.extend_from_slice(&encode_signed($v as i64).to_be_bytes()) + }; + } + match value { + ScalarValue::Int8(Some(v)) => be_signed!(*v), + ScalarValue::Int16(Some(v)) => be_signed!(*v), + ScalarValue::Int32(Some(v)) => be_signed!(*v), + ScalarValue::Int64(Some(v)) => be_signed!(*v), + ScalarValue::Date32(Some(v)) => be_signed!(*v), + ScalarValue::Date64(Some(v)) => be_signed!(*v), + ScalarValue::UInt8(Some(v)) => out.extend_from_slice(&(*v as u64).to_be_bytes()), + ScalarValue::UInt16(Some(v)) => out.extend_from_slice(&(*v as u64).to_be_bytes()), + ScalarValue::UInt32(Some(v)) => out.extend_from_slice(&(*v as u64).to_be_bytes()), + ScalarValue::UInt64(Some(v)) => out.extend_from_slice(&v.to_be_bytes()), + ScalarValue::Boolean(Some(b)) => out.push(*b as u8), + ScalarValue::Utf8(Some(s)) | ScalarValue::LargeUtf8(Some(s)) => { + encode_bytes(out, s.as_bytes()) + } + ScalarValue::Binary(Some(b)) + | ScalarValue::LargeBinary(Some(b)) + | ScalarValue::FixedSizeBinary(_, Some(b)) => encode_bytes(out, b), + other => { + return Err(Error::invalid_input(format!( + "Unsupported primary-key column type for composite key: {other:?}" + ))); + } + } + Ok(()) +} + +/// Encode a PK tuple (values in PK column order) to one order-preserving key. +pub fn encode_pk_tuple(values: &[ScalarValue]) -> Result> { + let mut out = Vec::with_capacity(values.len() * 9); + for value in values { + encode_value(&mut out, value)?; + } + Ok(out) +} + +/// Encode row `row` of `batch`'s PK columns (at `pk_indices`) to one key. 
+fn encode_pk_row(batch: &RecordBatch, pk_indices: &[usize], row: usize) -> Result> { + let mut out = Vec::with_capacity(pk_indices.len() * 9); + for &col in pk_indices { + let value = ScalarValue::try_from_array(batch.column(col), row)?; + encode_value(&mut out, &value)?; + } + Ok(out) +} + +/// Encode every row of `batch`'s PK columns (at `pk_indices`) into a `Binary` +/// column of order-preserving composite keys — the form a [`super::BTreeMemIndex`] +/// indexes directly (its byte backend), so the composite PK reuses the same +/// index as a single-column one. +pub fn encode_pk_batch(batch: &RecordBatch, pk_indices: &[usize]) -> Result { + let mut keys: Vec> = Vec::with_capacity(batch.num_rows()); + for row in 0..batch.num_rows() { + keys.push(encode_pk_row(batch, pk_indices, row)?); + } + Ok(BinaryArray::from_iter_values(keys.iter())) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::{Int32Array, StringArray}; + use arrow_schema::{DataType, Field, Schema}; + use std::sync::Arc; + + fn tuple(a: i32, b: &str) -> Vec { + vec![ScalarValue::Int32(Some(a)), ScalarValue::from(b)] + } + + #[test] + fn encoding_is_order_preserving_and_injective() { + // Sorting tuples by their encoding must match tuple order, and distinct + // tuples must produce distinct bytes. + let tuples = [ + tuple(1, "a"), + tuple(1, "ab"), + tuple(1, "b"), + tuple(2, "a"), + tuple(-1, "z"), + ]; + let mut encoded: Vec<(Vec, &Vec)> = tuples + .iter() + .map(|t| (encode_pk_tuple(t).unwrap(), t)) + .collect(); + encoded.sort_by(|x, y| x.0.cmp(&y.0)); + let order: Vec<_> = encoded.iter().map(|(_, t)| (*t).clone()).collect(); + // -1 < 1 < 2; within id=1, "a" < "ab" < "b". + assert_eq!( + order, + vec![ + tuple(-1, "z"), + tuple(1, "a"), + tuple(1, "ab"), + tuple(1, "b"), + tuple(2, "a"), + ] + ); + // Injective: 5 distinct tuples → 5 distinct keys. 
+ let mut keys: Vec> = tuples.iter().map(|t| encode_pk_tuple(t).unwrap()).collect(); + keys.sort(); + keys.dedup(); + assert_eq!(keys.len(), 5); + } + + #[test] + fn null_sorts_first_and_is_distinct() { + let null_a = vec![ScalarValue::Int32(None), ScalarValue::from("a")]; + let one_a = tuple(1, "a"); + assert!(encode_pk_tuple(&null_a).unwrap() < encode_pk_tuple(&one_a).unwrap()); + assert_ne!( + encode_pk_tuple(&null_a).unwrap(), + encode_pk_tuple(&one_a).unwrap() + ); + } + + #[test] + fn prefix_safety_with_embedded_zero() { + // A string containing 0x00 must not collide with or sort incorrectly + // against a shorter one (escaping + terminator). + let with_zero = vec![ScalarValue::Binary(Some(vec![0x00]))]; + let empty = vec![ScalarValue::Binary(Some(vec![]))]; + assert!(encode_pk_tuple(&empty).unwrap() < encode_pk_tuple(&with_zero).unwrap()); + } + + #[test] + fn encode_pk_batch_matches_per_tuple_encoding() { + // Each row of the encoded `Binary` column equals `encode_pk_tuple` of + // that row's PK values — so the column a BTreeMemIndex indexes is exactly + // what a probe builds. + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, false), + ])); + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(Int32Array::from(vec![2, 1])), + Arc::new(StringArray::from(vec!["a", "b"])), + ], + ) + .unwrap(); + let encoded = encode_pk_batch(&batch, &[0, 1]).unwrap(); + assert_eq!(encoded.value(0), encode_pk_tuple(&tuple(2, "a")).unwrap()); + assert_eq!(encoded.value(1), encode_pk_tuple(&tuple(1, "b")).unwrap()); + // (1,"b") encodes below (2,"a"). 
+ assert!(encoded.value(1) < encoded.value(0)); + } +} diff --git a/rust/lance/src/dataset/mem_wal/memtable/batch_store.rs b/rust/lance/src/dataset/mem_wal/memtable/batch_store.rs index f4d4d797acc..054d9b1630e 100644 --- a/rust/lance/src/dataset/mem_wal/memtable/batch_store.rs +++ b/rust/lance/src/dataset/mem_wal/memtable/batch_store.rs @@ -615,6 +615,22 @@ impl BatchStore { (0..end).collect() } + /// The inclusive maximum visible *row* position at `max_visible_batch_position`, + /// or `None` when no rows are visible. The visible batches are the committed + /// prefix `[0, last_visible_idx]`; each batch carries its cumulative + /// `row_offset`, so this is the end of the last visible batch minus one. + /// Used to bound MVCC seeks against the maintained PK-position index. + pub fn max_visible_row(&self, max_visible_batch_position: usize) -> Option { + let len = self.committed_len.load(Ordering::Acquire); + if len == 0 { + return None; + } + let last_visible_idx = max_visible_batch_position.min(len - 1); + let last = self.get(last_visible_idx)?; + let visible_end = last.row_offset + last.num_rows as u64; // exclusive + visible_end.checked_sub(1) + } + /// Check if a specific batch is visible at a given visibility position. #[inline] pub fn is_batch_visible( @@ -910,6 +926,37 @@ mod tests { assert!(!store.is_batch_visible(3, 10)); } + #[test] + fn test_max_visible_row() { + // (1) Empty store: no rows are visible at any position. + let store = BatchStore::with_capacity(10); + assert_eq!(store.max_visible_row(0), None); + assert_eq!(store.max_visible_row(100), None); + + // Three batches → rows [0,10) [10,30) [30,60); row_offsets 0, 10, 30. + store.append(create_test_batch(10)).unwrap(); // position 0 + store.append(create_test_batch(20)).unwrap(); // position 1 + store.append(create_test_batch(30)).unwrap(); // position 2 + + // (2) A position within range yields the inclusive end of that prefix. 
+ assert_eq!(store.max_visible_row(0), Some(9)); // batch 0: 0..10 + assert_eq!(store.max_visible_row(1), Some(29)); // batch 1: 10..30 + assert_eq!(store.max_visible_row(2), Some(59)); // batch 2: 30..60 + + // (3) A position beyond the committed range clamps to the last batch, + // i.e. the inclusive max over all rows. + assert_eq!(store.max_visible_row(100), Some(59)); + + // (4) An empty leading batch contributes no rows: at its own position + // the inclusive end underflows to None, while a later non-empty batch + // is reported correctly. + let store = BatchStore::with_capacity(10); + store.append(create_test_batch(0)).unwrap(); // position 0: rows [0,0) + store.append(create_test_batch(5)).unwrap(); // position 1: rows [0,5) + assert_eq!(store.max_visible_row(0), None); // empty prefix → no rows + assert_eq!(store.max_visible_row(1), Some(4)); // through batch 1 + } + #[test] fn test_recommended_capacity() { // 64MB memtable, 64KB avg batch = 1024 batches * 1.2 = ~1228 diff --git a/rust/lance/src/dataset/mem_wal/memtable/flush.rs b/rust/lance/src/dataset/mem_wal/memtable/flush.rs index c4794d4c8f3..be0a66d7d2c 100644 --- a/rust/lance/src/dataset/mem_wal/memtable/flush.rs +++ b/rust/lance/src/dataset/mem_wal/memtable/flush.rs @@ -178,6 +178,12 @@ impl MemTableFlusher { self.write_bloom_filter(&bloom_path, memtable.bloom_filter()) .await?; + // Write the standalone primary-key dedup sidecar. A primary key needs + // no secondary index, so this is required on the plain-flush path too — + // the LSM scanner opens it to dedup the generation. (`flush_with_indexes` + // writes it on the indexed path.) No-op when the memtable has no PK. + self.create_pk_index(&gen_path, memtable.indexes()).await?; + let new_manifest = self .update_manifest( epoch, @@ -449,6 +455,10 @@ impl MemTableFlusher { all_indexes.extend(fts_indexes); } + // Write the standalone primary-key dedup index (sidecar, not a manifest + // index — the block-list opens it directly by path). 
+ self.create_pk_index(&gen_path, memtable.indexes()).await?; + // Write a single manifest that records the fragments, the // within-generation deletion vector, and all indexes, overwriting the // data-only v1 manifest created by Dataset::write. @@ -543,6 +553,49 @@ impl MemTableFlusher { Ok(created_indexes) } + /// Write the standalone primary-key dedup index for this generation. + /// + /// Unlike user indexes, this is a **sidecar**: it is not registered in the + /// manifest. The block-list opens it directly by path + /// ([`pk_index_path`]) and probes it with `Equals`. Single-column primary + /// keys index the typed value; composite keys index the order-preserving + /// `Binary` encoded tuple (see [`super::super::index::encode_pk_tuple`]). + /// Row positions line up 1:1 with the forward-written data file, so they are + /// the flushed row ids directly. No-op without a primary-key index. + async fn create_pk_index( + &self, + gen_path: &Path, + mem_indexes: Option<&super::super::index::IndexStore>, + ) -> Result<()> { + use datafusion::physical_plan::SendableRecordBatchStream; + use datafusion::physical_plan::stream::RecordBatchStreamAdapter; + use lance_index::scalar::btree::train_btree_index; + use lance_index::scalar::lance_format::LanceIndexStore; + + use crate::dataset::mem_wal::util::pk_index_path; + + let Some(registry) = mem_indexes else { + return Ok(()); + }; + let batches = registry.pk_training_batches(8192)?; + if batches.is_empty() { + return Ok(()); + } + + let schema = batches[0].schema(); + let store = LanceIndexStore::new( + self.object_store.clone(), + pk_index_path(gen_path), + Arc::new(LanceCache::no_cache()), + ); + let stream: SendableRecordBatchStream = Box::pin(RecordBatchStreamAdapter::new( + schema, + futures::stream::iter(batches.into_iter().map(Ok)), + )); + train_btree_index(stream, &store, 8192, None, None).await?; + Ok(()) + } + /// Create FTS (Full-Text Search) indexes from in-memory data (uncommitted). 
/// /// Writes the FTS index files and returns index metadata without committing. @@ -1227,6 +1280,202 @@ mod tests { assert_eq!(rows.get(&3), Some(&"c2".to_string())); } + /// Flushing a memtable with a primary-key index writes a standalone sidecar + /// BTree at `{gen}/_pk_index` that the block-list can reopen by path and + /// probe by value — including for a within-gen-superseded PK (existence, + /// not visibility). + #[tokio::test] + async fn flushed_pk_index_sidecar_is_probeable() { + use lance_core::cache::LanceCache; + use lance_index::metrics::NoOpMetricsCollector; + use lance_index::registry::IndexPluginRegistry; + use lance_index::scalar::lance_format::LanceIndexStore; + use lance_index::scalar::{SargableQuery, SearchResult}; + + use super::super::super::index::IndexStore; + use crate::dataset::mem_wal::util::pk_index_path; + use datafusion::common::ScalarValue; + + let (store, base_path, _base_uri, _temp_dir) = create_local_store().await; + let shard_id = Uuid::new_v4(); + let manifest_store = Arc::new(ShardManifestStore::new( + store.clone(), + &base_path, + shard_id, + 2, + )); + let (epoch, _manifest) = manifest_store.claim_epoch(0).await.unwrap(); + + // Primary-key index on `id`, no user indexes. + let schema = create_pk_schema(); + let mut memtable = MemTable::new(schema.clone(), 1, vec![0]).unwrap(); + let mut registry = IndexStore::new(); + registry.enable_pk_index(&[("id".to_string(), 0)]); + memtable.set_indexes(registry); + + // id=1 updated in-gen (a -> a2); id=2 unique. 
+ let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 1])), + Arc::new(StringArray::from(vec!["a", "b", "a2"])), + ], + ) + .unwrap(); + let frag_id = memtable.insert(batch).await.unwrap(); + memtable.mark_wal_flushed(&[frag_id], 1, &[0]); + + let flusher = MemTableFlusher::new( + store.clone(), + base_path.clone(), + _base_uri.clone(), + shard_id, + manifest_store.clone(), + ); + let result = flusher + .flush_with_indexes(&memtable, epoch, &[], 1) + .await + .unwrap(); + + // Reopen the sidecar directly by path (the block-list's route). + let gen_path = base_path + .clone() + .join("_mem_wal") + .join(shard_id.to_string()) + .join(result.generation.path.as_str()); + let index_store = Arc::new(LanceIndexStore::new( + store.clone(), + pk_index_path(&gen_path), + Arc::new(LanceCache::no_cache()), + )); + let registry = IndexPluginRegistry::with_default_plugins(); + let plugin = registry.get_plugin_by_name("BTree").unwrap(); + let details = + prost_types::Any::from_msg(&lance_index::pbold::BTreeIndexDetails::default()).unwrap(); + let index = plugin + .load_index(index_store, &details, None, &LanceCache::no_cache()) + .await + .unwrap(); + + let contains = |id: i32| { + let index = index.clone(); + async move { + let result = index + .search( + &SargableQuery::Equals(ScalarValue::Int32(Some(id))), + &NoOpMetricsCollector, + ) + .await + .unwrap(); + match result { + SearchResult::Exact(s) | SearchResult::AtMost(s) | SearchResult::AtLeast(s) => { + !s.is_empty() + } + } + } + }; + // Both PKs present (id=1 even though its first version was superseded); + // an absent PK is not. + assert!(contains(1).await); + assert!(contains(2).await); + assert!(!contains(99).await); + } + + /// Regression: production dispatches a PK-only flush (a primary key, no + /// secondary index) to `flush`, not `flush_with_indexes`. 
`flush` must still + /// write the PK dedup sidecar, otherwise cross-generation dedup fails with + /// `page_lookup.lance not found`. + #[tokio::test] + async fn plain_flush_writes_pk_sidecar() { + use lance_core::cache::LanceCache; + use lance_index::metrics::NoOpMetricsCollector; + use lance_index::registry::IndexPluginRegistry; + use lance_index::scalar::lance_format::LanceIndexStore; + use lance_index::scalar::{SargableQuery, SearchResult}; + + use super::super::super::index::IndexStore; + use crate::dataset::mem_wal::util::pk_index_path; + use datafusion::common::ScalarValue; + + let (store, base_path, _base_uri, _temp_dir) = create_local_store().await; + let shard_id = Uuid::new_v4(); + let manifest_store = Arc::new(ShardManifestStore::new( + store.clone(), + &base_path, + shard_id, + 2, + )); + let (epoch, _manifest) = manifest_store.claim_epoch(0).await.unwrap(); + + // Primary-key index on `id`, no user indexes. + let schema = create_pk_schema(); + let mut memtable = MemTable::new(schema.clone(), 1, vec![0]).unwrap(); + let mut registry = IndexStore::new(); + registry.enable_pk_index(&[("id".to_string(), 0)]); + memtable.set_indexes(registry); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1, 2])), + Arc::new(StringArray::from(vec!["a", "b"])), + ], + ) + .unwrap(); + let frag_id = memtable.insert(batch).await.unwrap(); + memtable.mark_wal_flushed(&[frag_id], 1, &[0]); + + let flusher = MemTableFlusher::new( + store.clone(), + base_path.clone(), + _base_uri.clone(), + shard_id, + manifest_store.clone(), + ); + // The plain-flush path — what the writer dispatches to with no indexes. 
+ let result = flusher.flush(&memtable, epoch, 1).await.unwrap(); + + let gen_path = base_path + .clone() + .join("_mem_wal") + .join(shard_id.to_string()) + .join(result.generation.path.as_str()); + let index_store = Arc::new(LanceIndexStore::new( + store.clone(), + pk_index_path(&gen_path), + Arc::new(LanceCache::no_cache()), + )); + let registry = IndexPluginRegistry::with_default_plugins(); + let plugin = registry.get_plugin_by_name("BTree").unwrap(); + let details = + prost_types::Any::from_msg(&lance_index::pbold::BTreeIndexDetails::default()).unwrap(); + let index = plugin + .load_index(index_store, &details, None, &LanceCache::no_cache()) + .await + .unwrap(); + + let contains = |id: i32| { + let index = index.clone(); + async move { + let result = index + .search( + &SargableQuery::Equals(ScalarValue::Int32(Some(id))), + &NoOpMetricsCollector, + ) + .await + .unwrap(); + match result { + SearchResult::Exact(s) | SearchResult::AtMost(s) | SearchResult::AtLeast(s) => { + !s.is_empty() + } + } + } + }; + assert!(contains(1).await); + assert!(contains(2).await); + assert!(!contains(99).await); + } + /// Covers `finalize_generation` writing both a deletion vector *and* /// indexes into the same manifest — the deletion-only and index-only /// paths are exercised by sibling tests. diff --git a/rust/lance/src/dataset/mem_wal/memtable/scanner/builder.rs b/rust/lance/src/dataset/mem_wal/memtable/scanner/builder.rs index 2c5192e28a1..17fa9c76a65 100644 --- a/rust/lance/src/dataset/mem_wal/memtable/scanner/builder.rs +++ b/rust/lance/src/dataset/mem_wal/memtable/scanner/builder.rs @@ -366,6 +366,14 @@ impl MemTableScanner { self } + /// The `max_visible_batch_position` snapshot this scanner latched at + /// construction. A downstream recency filter must key on this same snapshot + /// (not a fresh read of the IndexStore watermark, which a concurrent append + /// could have advanced) so it stays consistent with the rows the search saw. 
+ pub fn max_visible_batch_position(&self) -> usize { + self.max_visible_batch_position + } + /// Include the _rowaddr column in output. /// /// Same value as _rowid but named for compatibility with LSM scanner. diff --git a/rust/lance/src/dataset/mem_wal/scanner.rs b/rust/lance/src/dataset/mem_wal/scanner.rs index b1766f8525f..f6942681223 100644 --- a/rust/lance/src/dataset/mem_wal/scanner.rs +++ b/rust/lance/src/dataset/mem_wal/scanner.rs @@ -43,6 +43,7 @@ mod point_lookup; mod projection; mod vector_search; +pub use block_list::write_pk_sidecar; pub use builder::LsmScanner; pub use collector::{ ActiveMemTableRef, InMemoryMemTableRef, InMemoryMemTables, LsmDataSourceCollector, diff --git a/rust/lance/src/dataset/mem_wal/scanner/block_list.rs b/rust/lance/src/dataset/mem_wal/scanner/block_list.rs index 8a293c3f988..fe197772492 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/block_list.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/block_list.rs @@ -3,37 +3,151 @@ //! Per-source block-list construction for LSM vector search. //! -//! A generation's membership is an `Arc>` of PK hashes -//! ([`compute_pk_hash`]), built once (immutable gens cached). Each source gets a -//! `Vec>>` of the newer generations' sets (`NEWER(G)`; base: all -//! of them) — referenced, never merged. The KNN drops candidates whose PK is in -//! any (see [`super::exec::PkHashFilterExec`]). +//! A generation's membership is a [`GenMembership`]: in-memory generations +//! (active / frozen) are probed by value against their maintained primary-key +//! index (no per-query set), while flushed generations are probed against their +//! standalone on-disk PK BTree (the sidecar written at flush, opened by path). +//! Probing is batched — [`GenMembership::contains_keys`] tests a whole batch of +//! keys per generation in one pass. Each source gets a `Vec` of +//! the newer generations (`NEWER(G)`; base: all of them); the KNN drops a +//! candidate whose PK any of them contains (see +//! 
[`super::exec::PkBlockFilterExec`]). //! -//! Cross-generation only: within-gen dups share a hash and fall to the global -//! dedup's `(generation, freshness)` tiebreaker. - -use std::collections::{HashMap, HashSet}; -use std::sync::Arc; - -use arrow_array::RecordBatch; -use futures::TryStreamExt; -use lance_core::Result; - +//! Cross-generation only: within-gen dups collapse via the global dedup's +//! `(generation, freshness)` tiebreaker. + +use std::collections::HashMap; +use std::sync::{Arc, LazyLock}; + +use datafusion::common::ScalarValue; +use lance_core::{Error, Result}; + +use lance_index::metrics::NoOpMetricsCollector; +use lance_index::registry::IndexPluginRegistry; +use lance_index::scalar::btree::BTreeIndex; +use lance_index::scalar::lance_format::LanceIndexStore; +use lance_index::scalar::{ + IndexStore as ScalarIndexStore, SargableQuery, ScalarIndex, SearchResult, +}; use uuid::Uuid; use super::data_source::{LsmDataSource, LsmGeneration}; -use super::exec::{compute_pk_hash, resolve_pk_indices}; use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset}; -use crate::dataset::Dataset; -use crate::dataset::mem_wal::write::BatchStore; +use crate::dataset::mem_wal::index::encode_pk_tuple; +use crate::dataset::mem_wal::util::PK_INDEX_DIR; +use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; use crate::session::Session; -/// Per-source blocked PK-hash sets, keyed by `(shard_id, generation)`. Each -/// value is the membership sets of the generations newer than that source. -pub type SourceBlockLists = HashMap<(Option, LsmGeneration), Vec>>>; +/// Default-plugin registry, used only to load the standalone PK BTree by its +/// `BTreeIndexDetails` type. Built once. +static PK_BTREE_REGISTRY: LazyLock> = + LazyLock::new(IndexPluginRegistry::with_default_plugins); + +/// One newer generation's PK membership, used to decide whether it shadows an +/// older source's row. 
+#[derive(Clone, Debug)] +pub enum GenMembership { + /// Probe the in-memory memtable's primary-key index, bounded to its visible + /// prefix (so a not-yet-visible write can't shadow an older visible copy). + InMemory { + index_store: Arc, + /// Inclusive visible row watermark; `None` when no rows are visible. + max_visible_row: Option, + }, + /// Probe the flushed generation's standalone on-disk PK BTree. + OnDisk(Arc), +} + +impl GenMembership { + /// Whether this generation visibly contains the primary `key` — the typed + /// value for a single-column PK, the encoded `Binary` tuple for a composite + /// one (built by [`on_disk_pk_key`]). The same key probes the in-memory + /// BTree and the flushed on-disk BTree, which now share a key space. + pub async fn contains(&self, key: &ScalarValue) -> Result { + match self { + Self::InMemory { + index_store, + max_visible_row, + } => Ok(max_visible_row.is_some_and(|max| index_store.pk_contains_key(key, max))), + Self::OnDisk(index) => { + let result = index + .search(&SargableQuery::Equals(key.clone()), &NoOpMetricsCollector) + .await + .map_err(|e| Error::io(e.to_string()))?; + Ok(!search_is_empty(&result)) + } + } + } + + /// Batched [`Self::contains`]: for each key in `keys`, whether this + /// generation visibly contains it, returned as a mask aligned to `keys`. + /// + /// One probe replaces N. The on-disk arm issues a single + /// [`BTreeIndex::contains_keys`] (no per-key `SearchResult` allocation); the + /// in-memory arm maps the sync, allocation-free PK lookup over the slice. + /// Keys are in the index's key space (see [`on_disk_pk_key`]). 
+ pub async fn contains_keys(&self, keys: &[ScalarValue]) -> Result> { + match self { + Self::InMemory { + index_store, + max_visible_row, + } => Ok(keys + .iter() + .map(|key| max_visible_row.is_some_and(|max| index_store.pk_contains_key(key, max))) + .collect()), + Self::OnDisk(index) => { + // The flushed PK sidecar is always a BTree (built via + // `PK_BTREE_REGISTRY`); downcast to reach the batched probe. + let btree = index.as_any().downcast_ref::().ok_or_else(|| { + Error::io("flushed PK dedup index is not a BTree".to_string()) + })?; + btree + .contains_keys(keys, &NoOpMetricsCollector) + .await + .map_err(|e| Error::io(e.to_string())) + } + } + } + + /// Whether this generation has no (visible) membership — used to skip adding + /// an empty blocked set. A flushed generation always has rows (flush rejects + /// an empty memtable), so it is never empty. + fn is_empty(&self) -> bool { + match self { + Self::InMemory { + index_store, + max_visible_row, + } => max_visible_row.is_none() || index_store.pk_is_empty(), + Self::OnDisk(_) => false, + } + } +} -/// A shard's generations paired with their PK-hash membership, before sorting. -type ShardGenSets = HashMap>)>>; +/// Whether a scalar search returned no rows (existence test for the block-list). +fn search_is_empty(result: &SearchResult) -> bool { + match result { + SearchResult::Exact(set) | SearchResult::AtMost(set) | SearchResult::AtLeast(set) => { + set.is_empty() + } + } +} + +/// The probe key for the on-disk PK BTree: a single-column PK indexes its typed +/// value directly; a composite PK indexes the order-preserving encoded tuple as +/// `Binary` (matching what flush wrote — see [`encode_pk_tuple`]). +pub fn on_disk_pk_key(values: &[ScalarValue]) -> Result { + match values { + [single] => Ok(single.clone()), + _ => Ok(ScalarValue::Binary(Some(encode_pk_tuple(values)?))), + } +} + +/// Per-source blocked memberships, keyed by `(shard_id, generation)`. 
Each value +/// is the memberships of the generations newer than that source. +pub type SourceBlockLists = HashMap<(Option, LsmGeneration), Vec>; + +/// A shard's generations paired with their membership, before sorting. +type ShardGenSets = HashMap>; /// Per-source `NEWER(G)`, keyed by `(shard_id, generation)`. Generations are /// per-shard, so a source is superseded only by strictly-newer generations of @@ -42,31 +156,28 @@ type ShardGenSets = HashMap>)>>; /// Only superseded sources get an entry; the newest of each shard never does. pub async fn compute_source_block_lists( sources: &[LsmDataSource], - pk_columns: &[String], session: Option<&Arc>, flushed_cache: Option<&Arc>, ) -> Result { - // Hash each non-base source's membership, grouped by shard (generations are - // per-shard, so supersession is within-shard only). Flushed-generation PK - // scans (cold-cache S3 reads) run concurrently; order is irrelevant — the - // per-shard lists are sorted by generation below. + // Membership per non-base source, grouped by shard (generations are + // per-shard, so supersession is within-shard only). let mut by_shard: ShardGenSets = HashMap::new(); let mut has_base = false; - let mut flushed_loads = Vec::new(); for source in sources { match source { LsmDataSource::BaseTable { .. } => has_base = true, LsmDataSource::ActiveMemTable { batch_store, + index_store, shard_id, generation, .. } => { - let hashes = Arc::new(pk_hashes_from_batch_store(batch_store, pk_columns)?); + let membership = in_memory_membership(batch_store, index_store); by_shard .entry(*shard_id) .or_default() - .push((*generation, hashes)); + .push((*generation, membership)); } LsmDataSource::FlushedMemTable { path, @@ -74,36 +185,29 @@ pub async fn compute_source_block_lists( generation, .. } => { - // Cached by immutable path so repeated searches skip the PK scan. 
- flushed_loads.push(async move { - flushed_pk_hashes(path, pk_columns, session, flushed_cache) - .await - .map(|hashes| (*shard_id, *generation, hashes)) - }); + let index = open_pk_index(path, session, flushed_cache).await?; + by_shard + .entry(*shard_id) + .or_default() + .push((*generation, GenMembership::OnDisk(index))); } } } - for (shard_id, generation, hashes) in futures::future::try_join_all(flushed_loads).await? { - by_shard - .entry(shard_id) - .or_default() - .push((generation, hashes)); - } let mut blocked: SourceBlockLists = HashMap::new(); // Base (shardless, oldest) is superseded by every non-base generation. - let mut base_blocked: Vec>> = Vec::new(); + let mut base_blocked: Vec = Vec::new(); for (shard, mut gens) in by_shard { // Newest-first: a gen's blocked list is its own shard's newer gens. gens.sort_by_key(|(generation, _)| std::cmp::Reverse(*generation)); - let mut newer: Vec>> = Vec::new(); - for (generation, hashes) in gens { + let mut newer: Vec = Vec::new(); + for (generation, membership) in gens { if !newer.is_empty() { blocked.insert((Some(shard), generation), newer.clone()); } - if !hashes.is_empty() { - base_blocked.push(hashes.clone()); - newer.push(hashes); + if !membership.is_empty() { + base_blocked.push(membership.clone()); + newer.push(membership); } } } @@ -113,263 +217,271 @@ pub async fn compute_source_block_lists( Ok(blocked) } -/// The fresh-tier block-list: one membership set per generation that shadows the -/// base table — active + frozen memtables (hashed now) and flushed generations -/// (from the cache). Same `Vec>>` shape the vector-search filter -/// consumes; a base/external reader can drop any row whose PK is in one of them. -/// The base source, if present, is skipped (it is what gets shadowed). 
+/// The fresh-tier block-list: one [`GenMembership`] per generation that shadows +/// the base table — active + frozen memtables (probed against their index) and +/// flushed generations (probed against their on-disk PK BTree). A base/external +/// reader can test any PK against these (via [`GenMembership::contains`]) to +/// decide whether the fresh tier shadows it. The base source, if present, is +/// skipped (it is what gets shadowed). pub async fn fresh_tier_block_list( sources: &[LsmDataSource], - pk_columns: &[String], session: Option<&Arc>, flushed_cache: Option<&Arc>, -) -> Result>>> { - // Flushed PK scans run concurrently (cold-cache S3 reads); ordered - // try_join_all keeps the source order of the returned sets. - let sets = futures::future::try_join_all(sources.iter().map(|source| async move { - Ok::<_, lance_core::Error>(match source { - LsmDataSource::BaseTable { .. } => None, - LsmDataSource::ActiveMemTable { batch_store, .. } => Some(Arc::new( - pk_hashes_from_batch_store(batch_store, pk_columns)?, - )), +) -> Result> { + let mut memberships = Vec::new(); + for source in sources { + let membership = match source { + LsmDataSource::BaseTable { .. } => continue, + LsmDataSource::ActiveMemTable { + batch_store, + index_store, + .. + } => in_memory_membership(batch_store, index_store), LsmDataSource::FlushedMemTable { path, .. } => { - Some(flushed_pk_hashes(path, pk_columns, session, flushed_cache).await?) + GenMembership::OnDisk(open_pk_index(path, session, flushed_cache).await?) } - }) - })) - .await?; - Ok(sets - .into_iter() - .flatten() - .filter(|set| !set.is_empty()) - .collect()) -} - -/// Hash the PK membership of an in-memory memtable (active or frozen) from its -/// committed `BatchStore` rows. 
-pub fn pk_hashes_from_batch_store( - store: &BatchStore, - pk_columns: &[String], -) -> Result> { - let mut batches: Vec = Vec::with_capacity(store.len()); - for i in 0..store.len() { - if let Some(stored) = store.get(i) { - batches.push(stored.data.clone()); + }; + if !membership.is_empty() { + memberships.push(membership); } } - pk_hashes_from_batches(&batches, pk_columns) + Ok(memberships) } -/// Hash every row's primary key across `batches` into a membership set. -fn pk_hashes_from_batches(batches: &[RecordBatch], pk_columns: &[String]) -> Result> { - let mut pk_hashes = HashSet::new(); - for batch in batches { - if batch.num_rows() == 0 { - continue; - } - let pk_indices = resolve_pk_indices(batch, pk_columns) - .map_err(|e| lance_core::Error::invalid_input(e.to_string()))?; - for row_idx in 0..batch.num_rows() { - pk_hashes.insert(compute_pk_hash(batch, &pk_indices, row_idx)); - } +/// Cross-source membership of an in-memory (active / frozen) memtable: a +/// snapshot-bounded probe of its maintained primary-key index. A memtable +/// without a primary-key index can't be probed, so it blocks nothing — the +/// production vector-search path always enables the index. +fn in_memory_membership( + batch_store: &Arc, + index_store: &Arc, +) -> GenMembership { + let max_visible_row = batch_store.max_visible_row(index_store.max_visible_batch_position()); + GenMembership::InMemory { + index_store: index_store.clone(), + max_visible_row, } - Ok(pk_hashes) } -/// Build (or fetch the cached) PK-hash membership for one flushed generation. -/// Cached by immutable path (single-flight); the build scans the flushed -/// dataset's PK columns. -async fn flushed_pk_hashes( +/// Open the standalone PK BTree at `{flushed gen}/_pk_index` for one flushed +/// generation. Reuses the flushed dataset's (session-configured) object store +/// and **its index cache**, then loads the sidecar directly by path through the +/// BTree plugin — it is not a manifest index. 
The opened index and its pages +/// are cached in the session's index cache (keyed by the immutable flushed +/// path), so repeated probes reuse them with no separate cache path and no +/// upfront scan; concurrent first-opens may each load before the cache fills. +/// A stable cache UUID for a non-manifest index identified only by its path. +/// +/// `DSIndexCache::for_index` keys by `&Uuid`, but the flushed PK sidecar has no +/// manifest UUID — its identity is its immutable path. Derive a deterministic +/// UUID from the path so the cache namespace is per-path and stable across +/// probes (the `uuid` crate lacks the `v5` "name-based" feature here, so hash to +/// a `u128` instead). +fn path_cache_uuid(path: &str) -> Uuid { + use std::hash::{Hash, Hasher}; + let mut lo = std::collections::hash_map::DefaultHasher::new(); + path.hash(&mut lo); + let mut hi = std::collections::hash_map::DefaultHasher::new(); + // Seed the high half differently so it never equals the low half. + "lance/flushed-pk-index".hash(&mut hi); + path.hash(&mut hi); + Uuid::from_u128(((hi.finish() as u128) << 64) | lo.finish() as u128) +} + +async fn open_pk_index( path: &str, - pk_columns: &[String], session: Option<&Arc>, flushed_cache: Option<&Arc>, -) -> Result>> { - match flushed_cache { - Some(cache) => { - let build_cache = cache.clone(); - let build_path = path.to_string(); - let build_session = session.cloned(); - let build_pk = pk_columns.to_vec(); - cache - .get_or_build_pk_hashes( - path, - // `Box::pin` keeps this build future off the caller's future - // (avoids `clippy::large_futures`). 
- Box::pin(async move { - let dataset = open_flushed_dataset( - &build_path, - build_session.as_ref(), - Some(&build_cache), - ) - .await?; - scan_pk_hashes(&dataset, &build_pk).await - }), - ) - .await - } - None => { - let dataset = open_flushed_dataset(path, session, None).await?; - Ok(Arc::new(scan_pk_hashes(&dataset, pk_columns).await?)) - } +) -> Result> { + let dataset = open_flushed_dataset(path, session, flushed_cache).await?; + // Namespace the session index cache by the (immutable) flushed path so this + // sidecar's pages live alongside every other index instead of a bespoke + // cache. `fri_uuid` is None — flushed generations carry no fragment-reuse. + let index_cache = dataset.index_cache.for_index(&path_cache_uuid(path), None); + let index_dir = dataset.base.clone().join(PK_INDEX_DIR); + let store: Arc = Arc::new(LanceIndexStore::new( + dataset.object_store.clone(), + index_dir, + Arc::new(index_cache.clone()), + )); + + let plugin = PK_BTREE_REGISTRY.get_plugin_by_name("BTree")?; + // Cache the opened index in the session cache (mirrors `open_scalar_index`). + if let Some(index) = plugin + .get_from_cache(store.clone(), None, &index_cache) + .await? + { + return Ok(index); } + let details = prost_types::Any::from_msg(&lance_index::pbold::BTreeIndexDetails::default()) + .map_err(|e| Error::io(e.to_string()))?; + let index = plugin + .load_index(store, &details, None, &index_cache) + .await?; + plugin.put_in_cache(&index_cache, index.clone()).await?; + Ok(index) } -/// Scan a dataset's PK columns and fold them into a membership set, one batch -/// resident at a time (no full PK-column buffer). -async fn scan_pk_hashes(dataset: &Dataset, pk_columns: &[String]) -> Result> { - let pk_refs: Vec<&str> = pk_columns.iter().map(String::as_str).collect(); - let mut scanner = dataset.scan(); - scanner.project(&pk_refs)?; - let mut stream = scanner.try_into_stream().await?; - let mut hashes = HashSet::new(); - while let Some(batch) = stream.try_next().await? 
{ - if batch.num_rows() == 0 { - continue; - } - let pk_indices = resolve_pk_indices(&batch, pk_columns) - .map_err(|e| lance_core::Error::invalid_input(e.to_string()))?; - for row in 0..batch.num_rows() { - hashes.insert(compute_pk_hash(&batch, &pk_indices, row)); - } +/// Write a flushed generation's standalone PK sidecar at `{uri}/_pk_index` from +/// `batches`, mirroring what flush does in production. `pk_columns` are the +/// primary-key column names (field ids are synthesized by position — `insert` +/// resolves columns by name). A no-op when no batch carries the PK columns. +/// +/// Used by Rust scanner tests and by the Python test-support binding to stage +/// faithful flushed generations (a flushed dataset alone, with no sidecar, is +/// not a state production ever produces). +pub async fn write_pk_sidecar( + uri: &str, + batches: &[arrow_array::RecordBatch], + pk_columns: &[&str], +) -> Result<()> { + use datafusion::physical_plan::stream::RecordBatchStreamAdapter; + use lance_core::cache::LanceCache; + use lance_index::scalar::btree::train_btree_index; + use lance_io::object_store::ObjectStore; + + use crate::dataset::mem_wal::util::pk_index_path; + + let pk: Vec<(String, i32)> = pk_columns + .iter() + .enumerate() + .map(|(i, c)| (c.to_string(), i as i32)) + .collect(); + let mut index = IndexStore::new(); + index.enable_pk_index(&pk); + let mut offset = 0u64; + for batch in batches { + index.insert(batch, offset)?; + offset += batch.num_rows() as u64; } - Ok(hashes) + + let training = index.pk_training_batches(8192)?; + if training.is_empty() { + return Ok(()); + } + let schema = training[0].schema(); + let (object_store, base_path) = ObjectStore::from_uri(uri).await?; + let store = LanceIndexStore::new( + object_store, + pk_index_path(&base_path), + Arc::new(LanceCache::no_cache()), + ); + let stream = Box::pin(RecordBatchStreamAdapter::new( + schema, + futures::stream::iter(training.into_iter().map(Ok)), + )); + // `train_btree_index` now returns 
the written index files; the sidecar + // writer only needs success/failure. + train_btree_index(stream, &store, 8192, None, None).await?; + Ok(()) } #[cfg(test)] mod tests { use super::*; - use arrow_array::Int32Array; + use crate::dataset::mem_wal::scanner::data_source::{LsmDataSource, LsmGeneration}; + use crate::dataset::mem_wal::write::IndexStore; + use arrow_array::{Int32Array, RecordBatch}; use arrow_schema::{DataType, Field, Schema}; use std::sync::Arc; + use uuid::Uuid; fn id_batch(ids: &[i32]) -> RecordBatch { let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(ids.to_vec()))]).unwrap() } - /// Hash a single Int32 `id` PK the way the planner does, so a test can probe - /// a returned blocked set by value. - fn hash_id(id: i32) -> u64 { - let batch = id_batch(&[id]); - let pk_indices = resolve_pk_indices(&batch, &["id".to_string()]).unwrap(); - compute_pk_hash(&batch, &pk_indices, 0) - } - - /// Whether `id`'s PK hash is blocked by any of a source's newer-gen sets. - fn blocks(sets: &[Arc>], id: i32) -> bool { - sets.iter().any(|s| s.contains(&hash_id(id))) - } - - #[test] - fn pk_hashes_collapse_within_gen_duplicates() { - // Two rows share pk=1 (a within-gen duplicate); pk=2 is unique. - let hashes = pk_hashes_from_batches(&[id_batch(&[1, 2, 1])], &["id".to_string()]).unwrap(); - assert_eq!(hashes.len(), 2); // distinct pks: 1, 2 + /// An active/frozen memtable source whose PK index holds one row per id in + /// `ids` (positions 0..n), all committed and visible. 
+ fn active_source(shard: Uuid, generation: u64, ids: &[i32]) -> LsmDataSource { + let store = BatchStore::with_capacity(16); + let mut index = IndexStore::new(); + index.enable_pk_index(&[("id".to_string(), 0)]); + for &id in ids { + let b = id_batch(&[id]); + let (bp, off, _) = store.append(b.clone()).unwrap(); + index.insert_with_batch_position(&b, off, Some(bp)).unwrap(); + } + LsmDataSource::ActiveMemTable { + batch_store: Arc::new(store), + index_store: Arc::new(index), + schema: id_batch(&[1]).schema(), + shard_id: shard, + generation: LsmGeneration::memtable(generation), + } } - #[test] - fn empty_batches_yield_empty_membership() { - let hashes = pk_hashes_from_batches(&[id_batch(&[])], &["id".to_string()]).unwrap(); - assert!(hashes.is_empty()); + /// Whether `id`'s PK is blocked by any of a source's newer-gen memberships. + async fn blocks(memberships: &[GenMembership], id: i32) -> bool { + let key = on_disk_pk_key(&[ScalarValue::Int32(Some(id))]).unwrap(); + for m in memberships { + if m.contains(&key).await.unwrap() { + return true; + } + } + false } #[test] - fn batch_store_membership_collapses_within_gen_dups() { - let store = BatchStore::with_capacity(8); - // Two single-row batches, both pk=1 (a within-gen update). - store.append(id_batch(&[1])).unwrap(); - store.append(id_batch(&[1])).unwrap(); - // A two-row batch: pk=2, pk=3. - store.append(id_batch(&[2, 3])).unwrap(); - - let hashes = pk_hashes_from_batch_store(&store, &["id".to_string()]).unwrap(); - assert_eq!(hashes.len(), 3); // distinct pks: 1, 2, 3 + fn on_disk_key_is_typed_for_single_and_binary_for_composite() { + // Single-column → the typed value; composite → encoded Binary. 
+ let single = [ScalarValue::Int32(Some(7))]; + assert_eq!( + on_disk_pk_key(&single).unwrap(), + ScalarValue::Int32(Some(7)) + ); + let composite = [ScalarValue::Int32(Some(1)), ScalarValue::from("a")]; + assert!(matches!( + on_disk_pk_key(&composite).unwrap(), + ScalarValue::Binary(Some(_)) + )); } #[tokio::test] - async fn fresh_tier_block_list_one_set_per_in_memory_gen() { - use crate::dataset::mem_wal::scanner::data_source::{LsmDataSource, LsmGeneration}; - use crate::dataset::mem_wal::write::IndexStore; - use uuid::Uuid; - + async fn fresh_tier_block_list_one_membership_per_in_memory_gen() { let shard = Uuid::new_v4(); - let mk = |ids: &[i32], generation: u64| { - let store = BatchStore::with_capacity(8); - store.append(id_batch(ids)).unwrap(); - LsmDataSource::ActiveMemTable { - batch_store: Arc::new(store), - index_store: Arc::new(IndexStore::new()), - schema: id_batch(&[1]).schema(), - shard_id: shard, - generation: LsmGeneration::memtable(generation), - } - }; // Active gen 2: pk=1,2. Frozen gen 1: pk=3. - let sources = vec![mk(&[1, 2], 2), mk(&[3], 1)]; + let sources = vec![ + active_source(shard, 2, &[1, 2]), + active_source(shard, 1, &[3]), + ]; - let sets = fresh_tier_block_list(&sources, &["id".to_string()], None, None) - .await - .unwrap(); + let memberships = fresh_tier_block_list(&sources, None, None).await.unwrap(); - // One set per generation; together they cover pk=1,2,3 (not 4). - assert_eq!(sets.len(), 2); + // One membership per generation; together they cover pk=1,2,3 (not 4). 
+ assert_eq!(memberships.len(), 2); for id in [1, 2, 3] { - assert!(blocks(&sets, id)); + assert!(blocks(&memberships, id).await); } - assert!(!blocks(&sets, 4)); + assert!(!blocks(&memberships, 4).await); } #[tokio::test] async fn block_lists_suppress_stale_across_in_memory_gens() { - use crate::dataset::mem_wal::scanner::data_source::{LsmDataSource, LsmGeneration}; - use crate::dataset::mem_wal::write::IndexStore; - use uuid::Uuid; - let shard = Uuid::new_v4(); - let mk = |batches: &[&[i32]], generation: u64| { - let store = BatchStore::with_capacity(8); - for ids in batches { - store.append(id_batch(ids)).unwrap(); - } - LsmDataSource::ActiveMemTable { - batch_store: Arc::new(store), - index_store: Arc::new(IndexStore::new()), - schema: id_batch(&[1]).schema(), - shard_id: shard, - generation: LsmGeneration::memtable(generation), - } - }; - - // Frozen gen 1: stale pk=1. - // Active gen 2: pk=1 re-written, pk=2 new. - let sources = vec![mk(&[&[1]], 1), mk(&[&[1], &[2]], 2)]; + // Frozen gen 1: stale pk=1. Active gen 2: pk=1 re-written, pk=2 new. + let sources = vec![ + active_source(shard, 1, &[1]), + active_source(shard, 2, &[1, 2]), + ]; - let blocked = Box::pin(compute_source_block_lists( - &sources, - &["id".to_string()], - None, - None, - )) - .await - .unwrap(); + let blocked = Box::pin(compute_source_block_lists(&sources, None, None)) + .await + .unwrap(); let g1 = LsmGeneration::memtable(1); let g2 = LsmGeneration::memtable(2); // The newer active write supersedes the frozen copy: gen 1 is blocked on // pk=1, so its KNN drops pk=1. - assert!(blocks(&blocked[&(Some(shard), g1)], 1)); + assert!(blocks(&blocked[&(Some(shard), g1)], 1).await); // The active (newest) generation is superseded by nothing — no entry. 
assert!(!blocked.contains_key(&(Some(shard), g2))); } #[tokio::test] async fn block_lists_suppress_stale_base_row() { - use crate::dataset::mem_wal::scanner::data_source::{LsmDataSource, LsmGeneration}; - use crate::dataset::mem_wal::write::IndexStore; use crate::dataset::{Dataset, WriteParams}; use arrow_array::RecordBatchIterator; - use uuid::Uuid; // Base (gen 0): pk=1 (stale), pk=3 (live). let base_batch = id_batch(&[1, 3]); @@ -384,89 +496,101 @@ mod tests { ); // Active gen 1: pk=1 re-written, pk=2 new. - let store = BatchStore::with_capacity(8); - store.append(id_batch(&[1])).unwrap(); - store.append(id_batch(&[2])).unwrap(); - let sources = vec![ LsmDataSource::BaseTable { dataset: base }, - LsmDataSource::ActiveMemTable { - batch_store: Arc::new(store), - index_store: Arc::new(IndexStore::new()), - schema, - shard_id: Uuid::new_v4(), - generation: LsmGeneration::memtable(1), - }, + active_source(Uuid::new_v4(), 1, &[1, 2]), ]; - let blocked = Box::pin(compute_source_block_lists( - &sources, - &["id".to_string()], - None, - None, - )) - .await - .unwrap(); + let blocked = Box::pin(compute_source_block_lists(&sources, None, None)) + .await + .unwrap(); // Base is blocked by every newer gen: pk=1 (re-written in gen 1) is - // blocked, pk=3 (base-only) is not. End-to-end drop: vector_search specs. + // blocked, pk=3 (base-only) is not. let base_blocked = blocked .get(&(None, LsmGeneration::BASE_TABLE)) .expect("base has a blocked set"); - assert!(blocks(base_blocked, 1)); - assert!(!blocks(base_blocked, 3)); + assert!(blocks(base_blocked, 1).await); + assert!(!blocks(base_blocked, 3).await); } #[tokio::test] async fn block_lists_are_keyed_per_shard() { // Regression: generations are per-shard, so a source must only be blocked - // by newer generations of its OWN shard. A generation-only key would - // cross-block same-generation sources from different shards. 
- use crate::dataset::mem_wal::scanner::data_source::{LsmDataSource, LsmGeneration}; - use crate::dataset::mem_wal::write::IndexStore; - use uuid::Uuid; - - let mk = |shard: Uuid, ids: &[i32], generation: u64| { - let store = BatchStore::with_capacity(8); - store.append(id_batch(ids)).unwrap(); - LsmDataSource::ActiveMemTable { - batch_store: Arc::new(store), - index_store: Arc::new(IndexStore::new()), - schema: id_batch(&[1]).schema(), - shard_id: shard, - generation: LsmGeneration::memtable(generation), - } - }; - - // Two shards, each: frozen gen 1 (stale) + active gen 2 (re-write). - // Shard A keys pk=1; shard B keys pk=2 (disjoint partitions). + // by newer generations of its OWN shard. let a = Uuid::new_v4(); let b = Uuid::new_v4(); + // Two shards, each: frozen gen 1 (stale) + active gen 2 (re-write). + // Shard A keys pk=1; shard B keys pk=2 (disjoint partitions). let sources = vec![ - mk(a, &[1], 1), - mk(a, &[1], 2), - mk(b, &[2], 1), - mk(b, &[2], 2), + active_source(a, 1, &[1]), + active_source(a, 2, &[1]), + active_source(b, 1, &[2]), + active_source(b, 2, &[2]), ]; - let blocked = Box::pin(compute_source_block_lists( - &sources, - &["id".to_string()], - None, - None, - )) - .await - .unwrap(); + let blocked = Box::pin(compute_source_block_lists(&sources, None, None)) + .await + .unwrap(); let g1 = LsmGeneration::memtable(1); let g2 = LsmGeneration::memtable(2); // Each shard's gen 1 is blocked by its OWN gen 2 only. - assert!(blocks(&blocked[&(Some(a), g1)], 1)); - assert!(!blocks(&blocked[&(Some(a), g1)], 2)); - assert!(blocks(&blocked[&(Some(b), g1)], 2)); - assert!(!blocks(&blocked[&(Some(b), g1)], 1)); + assert!(blocks(&blocked[&(Some(a), g1)], 1).await); + assert!(!blocks(&blocked[&(Some(a), g1)], 2).await); + assert!(blocks(&blocked[&(Some(b), g1)], 2).await); + assert!(!blocks(&blocked[&(Some(b), g1)], 1).await); // The newest generation of each shard is superseded by nothing. 
assert!(!blocked.contains_key(&(Some(a), g2))); assert!(!blocked.contains_key(&(Some(b), g2))); } + + #[tokio::test] + async fn index_membership_is_snapshot_bounded() { + // The index-sourced membership only counts a PK whose version is visible + // at the source's watermark, so a newer generation's not-yet-visible + // write can't shadow an older generation's visible copy. + let shard = Uuid::new_v4(); + let schema = id_batch(&[1]).schema(); + + // Older frozen gen 1: pk=1. + let g1 = active_source(shard, 1, &[1]); + + // Newer active gen 2: pk=99 visible at position 0, then pk=1 written at + // position 1 but with the watermark left at batch 0 (so pk=1 is in the + // index yet not visible) — the concurrent-write race. + let g2_store = BatchStore::with_capacity(8); + let mut g2_index = IndexStore::new(); + g2_index.enable_pk_index(&[("id".to_string(), 0)]); + let b0 = id_batch(&[99]); + let (bp0, off0, _) = g2_store.append(b0.clone()).unwrap(); + g2_index + .insert_with_batch_position(&b0, off0, Some(bp0)) // advances watermark to 0 + .unwrap(); + let b1 = id_batch(&[1]); + let (_, off1, _) = g2_store.append(b1.clone()).unwrap(); + g2_index + .insert_with_batch_position(&b1, off1, None) // index updated, watermark unchanged + .unwrap(); + let g2 = LsmDataSource::ActiveMemTable { + batch_store: Arc::new(g2_store), + index_store: Arc::new(g2_index), + schema, + shard_id: shard, + generation: LsmGeneration::memtable(2), + }; + + let blocked = Box::pin(compute_source_block_lists(&[g1, g2], None, None)) + .await + .unwrap(); + + let g1_block = &blocked[&(Some(shard), LsmGeneration::memtable(1))]; + // pk=99 is visible in gen 2 → it blocks gen 1's pk=99. + assert!(blocks(g1_block, 99).await); + // pk=1's only gen-2 copy is not yet visible → it must NOT shadow gen 1. 
+ assert!( + !blocks(g1_block, 1).await, + "a not-yet-visible newer write must not shadow an older visible copy" + ); + } } diff --git a/rust/lance/src/dataset/mem_wal/scanner/builder.rs b/rust/lance/src/dataset/mem_wal/scanner/builder.rs index 508605c4642..1ab0950baf8 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/builder.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/builder.rs @@ -476,21 +476,34 @@ impl LsmScanner { /// [`FlushedMemTableCache`] when one is set. pub async fn contains_pks(&self, pks: &RecordBatch) -> Result> { let sources = self.build_collector().collect()?; - let sets = super::block_list::fresh_tier_block_list( + let memberships = super::block_list::fresh_tier_block_list( &sources, - &self.pk_columns, self.session.as_ref(), self.flushed_cache.as_ref(), ) .await?; let pk_indices = super::exec::resolve_pk_indices(pks, &self.pk_columns) .map_err(|e| Error::invalid_input(e.to_string()))?; - Ok((0..pks.num_rows()) - .map(|row| { - let hash = super::exec::compute_pk_hash(pks, &pk_indices, row); - sets.iter().any(|set| set.contains(&hash)) - }) - .collect()) + let mut contained = Vec::with_capacity(pks.num_rows()); + for row in 0..pks.num_rows() { + // Both in-memory and flushed generations probe by the same key (the + // typed value, or the encoded `Binary` tuple for a composite PK). + let values: Vec = pk_indices + .iter() + .map(|&col| ScalarValue::try_from_array(pks.column(col), row)) + .collect::>() + .map_err(|e| Error::invalid_input(e.to_string()))?; + let key = super::block_list::on_disk_pk_key(&values)?; + let mut found = false; + for membership in &memberships { + if membership.contains(&key).await? { + found = true; + break; + } + } + contained.push(found); + } + Ok(contained) } /// Build the data source collector. 
@@ -607,10 +620,14 @@ mod tests { }; let mk = |ids: &[i32], generation: u64| { let store = BatchStore::with_capacity(8); - store.append(id_batch(ids)).unwrap(); + let mut index = IndexStore::new(); + index.enable_pk_index(&[("id".to_string(), 0)]); + let b = id_batch(ids); + let (bp, off, _) = store.append(b.clone()).unwrap(); + index.insert_with_batch_position(&b, off, Some(bp)).unwrap(); InMemoryMemTableRef { batch_store: Arc::new(store), - index_store: Arc::new(IndexStore::new()), + index_store: Arc::new(index), schema: schema.clone(), generation, } diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec.rs b/rust/lance/src/dataset/mem_wal/scanner/exec.rs index 88fd617dc0a..115cffccc81 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/exec.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/exec.rs @@ -9,22 +9,22 @@ //! - [`MemtableGenTagExec`]: Wraps a scan to add `_memtable_gen` column //! - [`BloomFilterGuardExec`]: Guards child execution with bloom filter check //! - [`CoalesceFirstExec`]: Returns first non-empty result with short-circuit -//! - [`WithinSourceDedupExec`]: Deduplicates rows with the same PK from a single source -//! - [`PkHashFilterExec`]: Drops rows whose PK hash was superseded by a newer generation (the cross-generation block-list) +//! - [`PkBlockFilterExec`]: Drops rows whose PK was superseded by a newer generation (the cross-generation block-list) +//! 
- [`NewestPkFilterExec`]: Drops active-memtable hits that aren't the newest visible version of their PK (the within-source recency filter) mod bloom_guard; mod coalesce_first; mod generation_tag; +mod newest_pk_filter; mod pk; -mod pk_hash_filter; -mod within_source_dedup; +mod pk_block_filter; pub use bloom_guard::{BloomFilterGuardExec, compute_pk_hash_from_scalars}; pub use coalesce_first::CoalesceFirstExec; pub use generation_tag::{MEMTABLE_GEN_COLUMN, MemtableGenTagExec}; +pub use newest_pk_filter::NewestPkFilterExec; pub use pk::{ ROW_ADDRESS_COLUMN, compute_pk_hash, is_supported_pk_type, resolve_pk_indices, validate_pk_types, }; -pub use pk_hash_filter::PkHashFilterExec; -pub use within_source_dedup::{DedupDirection, WithinSourceDedupExec}; +pub use pk_block_filter::PkBlockFilterExec; diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec/newest_pk_filter.rs b/rust/lance/src/dataset/mem_wal/scanner/exec/newest_pk_filter.rs new file mode 100644 index 00000000000..e1495cb0bb1 --- /dev/null +++ b/rust/lance/src/dataset/mem_wal/scanner/exec/newest_pk_filter.rs @@ -0,0 +1,393 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Drop predicate-crossing stale rows from an active-memtable index search. +//! +//! The active memtable's HNSW / inverted index are append-only, so an updated +//! row's old entries stay live. When an update moves a row out of the query's +//! match set, the fresh version isn't in the index result, so a result-set +//! dedup (keep-newest among the returned rows) has nothing to suppress the +//! stale version against — and it leaks. +//! +//! This node closes that hole with a predicate-independent recency check: for +//! each hit it asks the memtable's maintained primary-key index +//! ([`IndexStore::pk_is_newest`]) whether the hit's own row position is the +//! newest version of its primary key visible at the query's `max_visible` +//! watermark, and keeps the hit **iff so**. 
A stale hit (some +//! newer version exists) is dropped even when that newer version never appears +//! in the result. This is exactly the seek point-lookup already does; the index +//! search arms simply didn't do it. + +use std::any::Any; +use std::fmt; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use arrow::compute::filter_record_batch; +use arrow_array::{Array, BooleanArray, RecordBatch, UInt64Array}; +use arrow_schema::SchemaRef; +use datafusion::common::ScalarValue; +use datafusion::error::{DataFusionError, Result as DFResult}; +use datafusion::execution::TaskContext; +use datafusion::physical_expr::EquivalenceProperties; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties, + SendableRecordBatchStream, +}; +use futures::{Stream, StreamExt}; + +use super::pk::resolve_pk_indices; +use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; + +/// Keeps only the index hits that are the newest visible version of their PK. +/// +/// The input must expose all `pk_columns` and the `row_id_column` (`UInt64`, +/// the BatchStore row position). The output schema is unchanged. +pub struct NewestPkFilterExec { + input: Arc, + pk_columns: Vec, + row_id_column: String, + /// Holds the maintained primary-key index, queried per hit via + /// [`IndexStore::pk_is_newest`]. + index_store: Arc, + /// Resolves the `max_visible` row watermark from the visible batch prefix. + batch_store: Arc, + /// The MVCC batch-position snapshot the index search latched. Captured once + /// at plan time and shared with the search so the recency check keys on the + /// same snapshot the hits came from. + max_visible_batch_position: usize, + properties: Arc, +} + +impl fmt::Debug for NewestPkFilterExec { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // `BatchStore` / `IndexStore` aren't `Debug`; show only the knobs. 
+ f.debug_struct("NewestPkFilterExec") + .field("pk_columns", &self.pk_columns) + .field("row_id_column", &self.row_id_column) + .field( + "max_visible_batch_position", + &self.max_visible_batch_position, + ) + .finish() + } +} + +impl NewestPkFilterExec { + pub fn new( + input: Arc, + pk_columns: Vec, + row_id_column: impl Into, + index_store: Arc, + batch_store: Arc, + max_visible_batch_position: usize, + ) -> Self { + // A filter preserves the input schema and partitioning. + let properties = Arc::new(PlanProperties::new( + EquivalenceProperties::new(input.schema()), + input.output_partitioning().clone(), + input.pipeline_behavior(), + input.boundedness(), + )); + Self { + input, + pk_columns, + row_id_column: row_id_column.into(), + index_store, + batch_store, + max_visible_batch_position, + properties, + } + } + + /// The inclusive max visible row position for this snapshot, or `None` when + /// no rows are visible. + fn max_visible_row(&self) -> Option { + self.batch_store + .max_visible_row(self.max_visible_batch_position) + } +} + +impl DisplayAs for NewestPkFilterExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { + match t { + DisplayFormatType::Default + | DisplayFormatType::Verbose + | DisplayFormatType::TreeRender => { + write!( + f, + "NewestPkFilterExec: pk=[{}], row_id={}, max_visible_batch={}", + self.pk_columns.join(", "), + self.row_id_column, + self.max_visible_batch_position, + ) + } + } + } +} + +impl ExecutionPlan for NewestPkFilterExec { + fn name(&self) -> &str { + "NewestPkFilterExec" + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.input.schema() + } + + fn properties(&self) -> &Arc { + &self.properties + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.input] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> DFResult> { + if children.len() != 1 { + return Err(DataFusionError::Internal( + "NewestPkFilterExec requires exactly one 
child".to_string(), + )); + } + Ok(Arc::new(Self::new( + children[0].clone(), + self.pk_columns.clone(), + self.row_id_column.clone(), + self.index_store.clone(), + self.batch_store.clone(), + self.max_visible_batch_position, + ))) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> DFResult { + let input_stream = self.input.execute(partition, context)?; + Ok(Box::pin(NewestPkFilterStream { + input: input_stream, + pk_columns: self.pk_columns.clone(), + row_id_column: self.row_id_column.clone(), + index_store: self.index_store.clone(), + max_visible_row: self.max_visible_row(), + schema: self.schema(), + })) + } +} + +struct NewestPkFilterStream { + input: SendableRecordBatchStream, + pk_columns: Vec, + row_id_column: String, + index_store: Arc, + /// Inclusive watermark snapshot; `None` when no rows are visible. + max_visible_row: Option, + schema: SchemaRef, +} + +impl NewestPkFilterStream { + fn filter_batch(&self, batch: RecordBatch) -> DFResult { + // No primary-key index (memtable without a primary key), no visible + // rows, or an empty batch: nothing to dedup against, so pass it through. + if !self.index_store.has_pk_index() { + return Ok(batch); + } + let Some(max_visible_row) = self.max_visible_row else { + return Ok(batch); + }; + if batch.num_rows() == 0 { + return Ok(batch); + } + + let pk_indices = resolve_pk_indices(&batch, &self.pk_columns)?; + let row_ids = batch + .column_by_name(&self.row_id_column) + .ok_or_else(|| { + DataFusionError::Internal(format!( + "Row-id column '{}' not found in NewestPkFilterExec input", + self.row_id_column + )) + })? + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal(format!( + "Row-id column '{}' is not UInt64", + self.row_id_column + )) + })?; + + let mut keep = Vec::with_capacity(batch.num_rows()); + for row in 0..batch.num_rows() { + // A null row position can't be ordered; keep it rather than guess + // (callers always project a real position here). 
+ if row_ids.is_null(row) { + keep.push(true); + continue; + } + let position = row_ids.value(row); + let values: Vec = pk_indices + .iter() + .map(|&col| ScalarValue::try_from_array(batch.column(col), row)) + .collect::>()?; + // Keep iff this hit is the newest visible version of its PK. + keep.push( + self.index_store + .pk_is_newest(&values, position, max_visible_row), + ); + } + filter_record_batch(&batch, &BooleanArray::from(keep)) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None)) + } +} + +impl Stream for NewestPkFilterStream { + type Item = DFResult; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + match self.input.poll_next_unpin(cx) { + Poll::Ready(Some(Ok(batch))) => Poll::Ready(Some(self.filter_batch(batch))), + other => other, + } + } +} + +impl datafusion::physical_plan::RecordBatchStream for NewestPkFilterStream { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::Int32Array; + use arrow_schema::{DataType, Field, Schema}; + use datafusion::prelude::SessionContext; + use datafusion_physical_plan::test::TestMemoryExec; + use futures::TryStreamExt; + + /// Single-column `id` PK batch, one per append so a caller can control + /// row-level visibility via `max_visible_batch_position`. + fn id_batch(id: i32) -> RecordBatch { + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(vec![id]))]).unwrap() + } + + /// Index-search "hits": `(id, _rowid)` pairs the filter evaluates. 
+ fn hits(rows: &[(i32, u64)]) -> RecordBatch { + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new(lance_core::ROW_ID, DataType::UInt64, true), + ])); + let ids: Vec = rows.iter().map(|(id, _)| *id).collect(); + let rowids: Vec = rows.iter().map(|(_, p)| *p).collect(); + RecordBatch::try_new( + schema, + vec![ + Arc::new(Int32Array::from(ids)), + Arc::new(UInt64Array::from(rowids)), + ], + ) + .unwrap() + } + + /// Build an active memtable whose PK index + BatchStore hold one row per + /// `id` in `appended` (positions 0..n), all committed. + fn active(appended: &[i32]) -> (Arc, Arc) { + let batch_store = Arc::new(BatchStore::with_capacity(16)); + let mut index = IndexStore::new(); + index.enable_pk_index(&[("id".to_string(), 0)]); + for &id in appended { + let b = id_batch(id); + let (bp, off, _) = batch_store.append(b.clone()).unwrap(); + index.insert_with_batch_position(&b, off, Some(bp)).unwrap(); + } + (Arc::new(index), batch_store) + } + + async fn run( + index_store: Arc, + batch_store: Arc, + max_visible_batch_position: usize, + hits_batch: RecordBatch, + ) -> Vec<(i32, u64)> { + let input = + TestMemoryExec::try_new_exec(&[vec![hits_batch.clone()]], hits_batch.schema(), None) + .unwrap(); + let exec = NewestPkFilterExec::new( + input, + vec!["id".to_string()], + lance_core::ROW_ID, + index_store, + batch_store, + max_visible_batch_position, + ); + let ctx = SessionContext::new(); + let out: Vec = exec + .execute(0, ctx.task_ctx()) + .unwrap() + .try_collect() + .await + .unwrap(); + let mut rows = Vec::new(); + for b in &out { + let ids = b.column(0).as_any().downcast_ref::().unwrap(); + let pos = b.column(1).as_any().downcast_ref::().unwrap(); + for i in 0..b.num_rows() { + rows.push((ids.value(i), pos.value(i))); + } + } + rows + } + + #[tokio::test] + async fn keeps_only_the_newest_visible_position_per_pk() { + // id=1 written at positions 0 and 2 (an update), id=2 at position 1; all + // visible. 
A stale hit (id=1 @ 0) is dropped; the newest (id=1 @ 2) and + // the unrelated id=2 survive — even though all three were "returned" by + // the index search. + let (index, store) = active(&[1, 2, 1]); + let rows = run(index, store, 2, hits(&[(1, 0), (2, 1), (1, 2)])).await; + assert_eq!(rows, vec![(2, 1), (1, 2)]); + } + + #[tokio::test] + async fn does_not_vanish_a_visible_row_under_a_newer_invisible_write() { + // The store/index hold id=1 at positions 0 and 2, but the query latched + // `max_visible_batch_position = 0` (only position 0 visible) — i.e. the + // update at position 2 was committed *after* this query's snapshot. The + // visible older row (id=1 @ 0) must be KEPT (its newest *visible* version + // is itself), not dropped because of the not-yet-visible position 2. + let (index, store) = active(&[1, 2, 1]); + let kept = run(index.clone(), store.clone(), 0, hits(&[(1, 0)])).await; + assert_eq!(kept, vec![(1, 0)], "visible row must not vanish"); + + // And the not-yet-visible position is itself dropped (outside snapshot). + let dropped = run(index, store, 0, hits(&[(1, 2)])).await; + assert!( + dropped.is_empty(), + "row beyond the snapshot must be dropped" + ); + } + + #[tokio::test] + async fn passes_through_when_no_pk_index() { + // A memtable without a primary-key index can't be deduped here, so the + // filter is a pass-through rather than dropping everything. + let batch_store = Arc::new(BatchStore::with_capacity(16)); + batch_store.append(id_batch(1)).unwrap(); + let index = Arc::new(IndexStore::new()); // no enable_pk_index + let rows = run(index, batch_store, 0, hits(&[(1, 0), (1, 9)])).await; + assert_eq!(rows, vec![(1, 0), (1, 9)]); + } +} diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec/pk.rs b/rust/lance/src/dataset/mem_wal/scanner/exec/pk.rs index 523dd30bf82..0707eb5e8dd 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/exec/pk.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/exec/pk.rs @@ -4,7 +4,7 @@ //! 
Shared primary-key helpers for the LSM scanner execution nodes. //! //! Centralizes PK column resolution and per-row hashing so that every -//! consumer (e.g. [`super::WithinSourceDedupExec`], [`super::PkHashFilterExec`]) +//! consumer (e.g. [`super::PkBlockFilterExec`], [`super::NewestPkFilterExec`]) //! resolves and hashes a primary key the same way. The row hash is kept //! consistent with the variants supported by [`super::compute_pk_hash_from_scalars`] //! so a single PK produces the same hash regardless of which exec consumes it. diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec/pk_block_filter.rs b/rust/lance/src/dataset/mem_wal/scanner/exec/pk_block_filter.rs new file mode 100644 index 00000000000..c5b8f959d26 --- /dev/null +++ b/rust/lance/src/dataset/mem_wal/scanner/exec/pk_block_filter.rs @@ -0,0 +1,373 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Drop superseded rows from a per-source result by primary-key membership. +//! +//! Drops a row when any newer generation's membership ([`GenMembership`]) +//! contains its primary key — in-memory generations probe their PK index by +//! value, flushed generations probe their on-disk PK BTree. Each generation is +//! probed once per batch (see the perf note below). Used both as the KNN +//! post-filter (vector search, with over-fetch) and the cross-generation scan +//! filter (`k = 0`). +//! +//! Cross-generation only: within-gen duplicates collapse via the global dedup's +//! `(generation, freshness)` tiebreaker. +//! +//! Post-filters an over-fetched KNN (the planner's `overfetch_factor`); warns +//! when a source had >= k candidates but < k survived (over-fetch too small). +//! +//! Perf note: each generation is probed once per batch via +//! [`GenMembership::contains_keys`] — a batched existence check over the +//! batch's keys — not once per row. The on-disk arm issues a single +//! 
`BTreeIndex::contains_keys` (one page pass, no per-key `SearchResult` +//! allocation); the in-memory arm maps a sync PK lookup over the keys. Probes +//! are not disk-bound in steady state: the opened index and its (small, +//! memtable-sized) pages are held by the injected `FlushedMemTableCache` / +//! `LanceCache`, so after the first touch every probe is memory-resident. +//! Already-blocked rows are dropped from the key set before probing older +//! generations, preserving the per-row short-circuit. + +use std::any::Any; +use std::fmt; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use arrow::compute::filter_record_batch; +use arrow_array::{BooleanArray, RecordBatch}; +use arrow_schema::SchemaRef; +use datafusion::common::ScalarValue; +use datafusion::error::{DataFusionError, Result as DFResult}; +use datafusion::execution::TaskContext; +use datafusion::physical_expr::EquivalenceProperties; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties, + SendableRecordBatchStream, +}; +use futures::future::BoxFuture; +use futures::{FutureExt, Stream, StreamExt}; +use tracing::warn; + +use super::super::block_list::{GenMembership, on_disk_pk_key}; +use super::pk::resolve_pk_indices; + +/// Filters out rows whose PK is contained in any newer generation's membership. +#[derive(Debug)] +pub struct PkBlockFilterExec { + input: Arc, + pk_columns: Vec, + /// Newer generations' membership; a row is blocked if any contains its PK. + blocked: Vec, + /// Target neighbor count, used only to warn on a per-source under-fetch. + k: usize, + properties: Arc, +} + +impl PkBlockFilterExec { + pub fn new( + input: Arc, + pk_columns: Vec, + blocked: Vec, + k: usize, + ) -> Self { + // A filter preserves the input schema and partitioning. 
+ let properties = Arc::new(PlanProperties::new( + EquivalenceProperties::new(input.schema()), + input.output_partitioning().clone(), + input.pipeline_behavior(), + input.boundedness(), + )); + Self { + input, + pk_columns, + blocked, + k, + properties, + } + } +} + +impl DisplayAs for PkBlockFilterExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { + match t { + DisplayFormatType::Default + | DisplayFormatType::Verbose + | DisplayFormatType::TreeRender => { + write!( + f, + "PkBlockFilterExec: pk_cols=[{}], gens={}", + self.pk_columns.join(", "), + self.blocked.len(), + ) + } + } + } +} + +impl ExecutionPlan for PkBlockFilterExec { + fn name(&self) -> &str { + "PkBlockFilterExec" + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.input.schema() + } + + fn properties(&self) -> &Arc { + &self.properties + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.input] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> DFResult> { + if children.len() != 1 { + return Err(DataFusionError::Internal( + "PkBlockFilterExec requires exactly one child".to_string(), + )); + } + Ok(Arc::new(Self::new( + children[0].clone(), + self.pk_columns.clone(), + self.blocked.clone(), + self.k, + ))) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> DFResult { + let input_stream = self.input.execute(partition, context)?; + Ok(Box::pin(PkBlockFilterStream { + input: input_stream, + config: Arc::new(FilterConfig { + pk_columns: self.pk_columns.clone(), + blocked: self.blocked.clone(), + }), + k: self.k, + schema: self.schema(), + pending: None, + input_seen: 0, + kept: 0, + warned: false, + })) + } +} + +/// Immutable per-stream filter config. Shared into each batch's `'static` async +/// future by a single `Arc` clone, rather than deep-cloning the PK columns and +/// memberships per batch. 
+struct FilterConfig { + pk_columns: Vec, + blocked: Vec, +} + +struct PkBlockFilterStream { + input: SendableRecordBatchStream, + config: Arc, + k: usize, + schema: SchemaRef, + /// The in-flight filter for the batch currently being processed (the probe + /// is async, so a batch is filtered off-poll and resumed here). + pending: Option>>, + input_seen: usize, + kept: usize, + warned: bool, +} + +/// Keep only the rows no newer-gen membership contains. Async because flushed +/// generations are probed against their on-disk PK BTree. +async fn filter_batch(batch: RecordBatch, config: Arc) -> DFResult { + let FilterConfig { + pk_columns, + blocked, + } = config.as_ref(); + if blocked.is_empty() || batch.num_rows() == 0 { + return Ok(batch); + } + let pk_indices = resolve_pk_indices(&batch, pk_columns)?; + let to_df = |e: lance_core::Error| DataFusionError::Execution(e.to_string()); + + // One key per row, in the index key space. + let keys: Vec = (0..batch.num_rows()) + .map(|row| { + let values: Vec = pk_indices + .iter() + .map(|&col| ScalarValue::try_from_array(batch.column(col), row)) + .collect::>()?; + on_disk_pk_key(&values).map_err(to_df) + }) + .collect::>()?; + + // A row is dropped if any newer generation contains its key. Probe each + // generation once (batched) rather than once per row, narrowing to the + // still-live rows so an already-blocked row isn't re-probed against older + // generations. 
+ let mut blocked_row = vec![false; keys.len()]; + let mut live: Vec = (0..keys.len()).collect(); + for membership in blocked { + if live.is_empty() { + break; + } + let live_keys: Vec = live.iter().map(|&i| keys[i].clone()).collect(); + let mask = membership.contains_keys(&live_keys).await.map_err(to_df)?; + let mut next_live = Vec::with_capacity(live.len()); + for (pos, &row) in live.iter().enumerate() { + if mask[pos] { + blocked_row[row] = true; + } else { + next_live.push(row); + } + } + live = next_live; + } + + let keep = BooleanArray::from_iter(blocked_row.into_iter().map(|b| Some(!b))); + filter_record_batch(&batch, &keep).map_err(|e| DataFusionError::ArrowError(Box::new(e), None)) +} + +impl Stream for PkBlockFilterStream { + type Item = DFResult; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + loop { + // Drive an in-flight filter to completion before pulling more input. + if let Some(fut) = this.pending.as_mut() { + return match fut.as_mut().poll(cx) { + Poll::Ready(Ok(out)) => { + this.pending = None; + this.kept += out.num_rows(); + Poll::Ready(Some(Ok(out))) + } + Poll::Ready(Err(e)) => { + this.pending = None; + Poll::Ready(Some(Err(e))) + } + Poll::Pending => Poll::Pending, + }; + } + + match this.input.poll_next_unpin(cx) { + Poll::Ready(Some(Ok(batch))) => { + this.input_seen += batch.num_rows(); + this.pending = Some(filter_batch(batch, this.config.clone()).boxed()); + // Loop to poll the just-created future. + } + Poll::Ready(Some(Err(e))) => return Poll::Ready(Some(Err(e))), + Poll::Ready(None) => { + // >= k candidates in, < k out: over-fetch missed superseded rows. + if !this.warned && this.input_seen >= this.k && this.kept < this.k { + warn!( + k = this.k, + fetched = this.input_seen, + kept = this.kept, + "LSM vector search: < k live rows survived the PK post-filter; \ + raise the over-fetch factor or use a true KNN prefilter." 
+ ); + this.warned = true; + } + return Poll::Ready(None); + } + Poll::Pending => return Poll::Pending, + } + } + } +} + +impl datafusion::physical_plan::RecordBatchStream for PkBlockFilterStream { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; + use arrow_array::Int32Array; + use arrow_schema::{DataType, Field, Schema}; + use datafusion::prelude::SessionContext; + use datafusion_physical_plan::test::TestMemoryExec; + use futures::TryStreamExt; + + fn int_batch(ids: &[i32]) -> RecordBatch { + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(ids.to_vec()))]).unwrap() + } + + /// An in-memory membership whose PK index holds `ids` (positions 0..n). + fn membership(ids: &[i32]) -> GenMembership { + let store = BatchStore::with_capacity(16); + let mut index = IndexStore::new(); + index.enable_pk_index(&[("id".to_string(), 0)]); + for &id in ids { + let b = int_batch(&[id]); + let (bp, off, _) = store.append(b.clone()).unwrap(); + index.insert_with_batch_position(&b, off, Some(bp)).unwrap(); + } + let max_visible_row = store.max_visible_row(index.max_visible_batch_position()); + GenMembership::InMemory { + index_store: Arc::new(index), + max_visible_row, + } + } + + async fn run(exec: PkBlockFilterExec) -> Vec { + let ctx = SessionContext::new(); + let out: Vec = exec + .execute(0, ctx.task_ctx()) + .unwrap() + .try_collect() + .await + .unwrap(); + out.iter() + .flat_map(|b| { + b.column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .values() + .to_vec() + }) + .collect() + } + + #[tokio::test] + async fn drops_rows_blocked_by_a_newer_generation() { + let b = int_batch(&[10, 20, 30]); + let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); + let exec = + PkBlockFilterExec::new(input, 
vec!["id".to_string()], vec![membership(&[20])], 1); + assert_eq!(run(exec).await, vec![10, 30]); + } + + #[tokio::test] + async fn blocks_a_pk_present_in_any_generation() { + // Two newer-gen memberships: a row is dropped if either contains its PK. + let b = int_batch(&[10, 20, 30]); + let blocked = vec![membership(&[10]), membership(&[30])]; + let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); + let exec = PkBlockFilterExec::new(input, vec!["id".to_string()], blocked, 1); + assert_eq!(run(exec).await, vec![20]); + } + + #[tokio::test] + async fn empty_blocked_keeps_all_rows() { + let b = int_batch(&[1, 2, 3]); + let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); + let exec = PkBlockFilterExec::new(input, vec!["id".to_string()], Vec::new(), 1); + assert_eq!(run(exec).await, vec![1, 2, 3]); + } +} diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec/pk_hash_filter.rs b/rust/lance/src/dataset/mem_wal/scanner/exec/pk_hash_filter.rs deleted file mode 100644 index ee473047d01..00000000000 --- a/rust/lance/src/dataset/mem_wal/scanner/exec/pk_hash_filter.rs +++ /dev/null @@ -1,350 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright The Lance Authors - -//! Drop superseded rows from a per-source KNN result by primary-key hash. -//! -//! Drops a row when its PK hash ([`super::compute_pk_hash`]) is in any `blocked` -//! set — the newer generations' membership (`Arc`, shared, never merged; -//! base table: all generations). Only the KNN output is hashed. -//! -//! Cross-generation only: within-gen duplicates share a hash, so the global -//! dedup's `(generation, freshness)` tiebreaker collapses those instead. -//! -//! Post-filters an over-fetched KNN (the planner's `overfetch_factor`); warns -//! when a source had >= k candidates but < k survived (over-fetch too small). 
- -use std::any::Any; -use std::collections::HashSet; -use std::fmt; -use std::pin::Pin; -use std::sync::Arc; -use std::task::{Context, Poll}; - -use arrow::compute::filter_record_batch; -use arrow_array::{BooleanArray, RecordBatch}; -use arrow_schema::SchemaRef; -use datafusion::error::{DataFusionError, Result as DFResult}; -use datafusion::execution::TaskContext; -use datafusion::physical_expr::EquivalenceProperties; -use datafusion::physical_plan::{ - DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties, - SendableRecordBatchStream, -}; -use futures::{Stream, StreamExt}; -use tracing::warn; - -use super::pk::{compute_pk_hash, resolve_pk_indices}; - -/// Filters out rows whose PK hash is in any set of `blocked`. -#[derive(Debug)] -pub struct PkHashFilterExec { - input: Arc, - pk_columns: Vec, - /// Newer generations' membership; a row is blocked if any set holds its hash. - blocked: Vec>>, - /// Target neighbor count, used only to warn on a per-source under-fetch. - k: usize, - properties: Arc, -} - -impl PkHashFilterExec { - pub fn new( - input: Arc, - pk_columns: Vec, - blocked: Vec>>, - k: usize, - ) -> Self { - // A filter preserves the input schema and partitioning. 
- let properties = Arc::new(PlanProperties::new( - EquivalenceProperties::new(input.schema()), - input.output_partitioning().clone(), - input.pipeline_behavior(), - input.boundedness(), - )); - Self { - input, - pk_columns, - blocked, - k, - properties, - } - } -} - -impl DisplayAs for PkHashFilterExec { - fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { - match t { - DisplayFormatType::Default - | DisplayFormatType::Verbose - | DisplayFormatType::TreeRender => { - let total: usize = self.blocked.iter().map(|s| s.len()).sum(); - write!( - f, - "PkHashFilterExec: pk_cols=[{}], gens={}, blocked={}", - self.pk_columns.join(", "), - self.blocked.len(), - total, - ) - } - } - } -} - -impl ExecutionPlan for PkHashFilterExec { - fn name(&self) -> &str { - "PkHashFilterExec" - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn schema(&self) -> SchemaRef { - self.input.schema() - } - - fn properties(&self) -> &Arc { - &self.properties - } - - fn children(&self) -> Vec<&Arc> { - vec![&self.input] - } - - fn with_new_children( - self: Arc, - children: Vec>, - ) -> DFResult> { - if children.len() != 1 { - return Err(DataFusionError::Internal( - "PkHashFilterExec requires exactly one child".to_string(), - )); - } - Ok(Arc::new(Self::new( - children[0].clone(), - self.pk_columns.clone(), - self.blocked.clone(), - self.k, - ))) - } - - fn execute( - &self, - partition: usize, - context: Arc, - ) -> DFResult { - let input_stream = self.input.execute(partition, context)?; - Ok(Box::pin(PkHashFilterStream { - input: input_stream, - pk_columns: self.pk_columns.clone(), - blocked: self.blocked.clone(), - k: self.k, - schema: self.schema(), - input_seen: 0, - kept: 0, - warned: false, - })) - } -} - -struct PkHashFilterStream { - input: SendableRecordBatchStream, - pk_columns: Vec, - blocked: Vec>>, - k: usize, - schema: SchemaRef, - input_seen: usize, - kept: usize, - warned: bool, -} - -impl PkHashFilterStream { - fn filter_batch(&self, batch: 
RecordBatch) -> DFResult { - if self.blocked.is_empty() || batch.num_rows() == 0 { - return Ok(batch); - } - let pk_indices = resolve_pk_indices(&batch, &self.pk_columns)?; - let keep: BooleanArray = (0..batch.num_rows()) - .map(|row| { - let hash = compute_pk_hash(&batch, &pk_indices, row); - !self.blocked.iter().any(|set| set.contains(&hash)) - }) - .collect(); - filter_record_batch(&batch, &keep) - .map_err(|e| DataFusionError::ArrowError(Box::new(e), None)) - } -} - -impl Stream for PkHashFilterStream { - type Item = DFResult; - - fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - match self.input.poll_next_unpin(cx) { - Poll::Ready(Some(Ok(batch))) => { - self.input_seen += batch.num_rows(); - match self.filter_batch(batch) { - Ok(out) => { - self.kept += out.num_rows(); - Poll::Ready(Some(Ok(out))) - } - Err(e) => Poll::Ready(Some(Err(e))), - } - } - Poll::Ready(None) => { - // >= k candidates in, < k out: the over-fetch missed superseded rows. - if !self.warned && self.input_seen >= self.k && self.kept < self.k { - warn!( - k = self.k, - fetched = self.input_seen, - kept = self.kept, - "LSM vector search: < k live rows survived the PK-hash post-filter; \ - raise the over-fetch factor or use a true KNN prefilter." - ); - self.warned = true; - } - Poll::Ready(None) - } - other => other, - } - } -} - -impl datafusion::physical_plan::RecordBatchStream for PkHashFilterStream { - fn schema(&self) -> SchemaRef { - self.schema.clone() - } -} - -#[cfg(test)] -mod tests { - use super::*; - use arrow_array::{Int32Array, StringArray}; - use arrow_schema::{DataType, Field, Schema}; - use datafusion::prelude::SessionContext; - use datafusion_physical_plan::test::TestMemoryExec; - use futures::TryStreamExt; - - /// Hash a single-column Int32 PK value the way the exec does, so a test can - /// build blocked sets from values rather than hand-computed hashes. 
- fn hash_int_pk(id: i32) -> u64 { - let batch = int_batch(&[id]); - let pk_indices = resolve_pk_indices(&batch, &["id".to_string()]).unwrap(); - compute_pk_hash(&batch, &pk_indices, 0) - } - - fn int_batch(ids: &[i32]) -> RecordBatch { - let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); - RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(ids.to_vec()))]).unwrap() - } - - fn blocked(ids: &[i32]) -> Vec>> { - vec![Arc::new(ids.iter().map(|&id| hash_int_pk(id)).collect())] - } - - async fn run(exec: PkHashFilterExec) -> Vec { - let ctx = SessionContext::new(); - let out: Vec = exec - .execute(0, ctx.task_ctx()) - .unwrap() - .try_collect() - .await - .unwrap(); - out.iter() - .flat_map(|b| { - b.column_by_name("id") - .unwrap() - .as_any() - .downcast_ref::() - .unwrap() - .values() - .to_vec() - }) - .collect() - } - - #[tokio::test] - async fn drops_rows_with_blocked_pk_hash() { - let b = int_batch(&[10, 20, 30]); - let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); - let exec = PkHashFilterExec::new(input, vec!["id".to_string()], blocked(&[20]), 1); - assert_eq!(run(exec).await, vec![10, 30]); - } - - #[tokio::test] - async fn blocks_a_pk_present_in_any_generation_set() { - // Two newer-gen sets: a row is dropped if either contains its PK. 
- let b = int_batch(&[10, 20, 30]); - let sets = vec![ - Arc::new(HashSet::from([hash_int_pk(10)])), - Arc::new(HashSet::from([hash_int_pk(30)])), - ]; - let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); - let exec = PkHashFilterExec::new(input, vec!["id".to_string()], sets, 1); - assert_eq!(run(exec).await, vec![20]); - } - - #[tokio::test] - async fn empty_blocked_keeps_all_rows() { - let b = int_batch(&[1, 2, 3]); - let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); - let exec = PkHashFilterExec::new(input, vec!["id".to_string()], Vec::new(), 1); - assert_eq!(run(exec).await, vec![1, 2, 3]); - } - - #[tokio::test] - async fn null_pk_is_hashed_consistently_and_blockable() { - // A null PK hashes deterministically (compute_pk_hash hashes is_null), - // so a superseded null-key row can be dropped like any other. - let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, true)])); - let with_null = |ids: Vec>| { - RecordBatch::try_new(schema.clone(), vec![Arc::new(Int32Array::from(ids))]).unwrap() - }; - let pk = vec!["id".to_string()]; - let null_row = with_null(vec![None]); - let pk_indices = resolve_pk_indices(&null_row, &pk).unwrap(); - let sets = vec![Arc::new(HashSet::from([compute_pk_hash( - &null_row, - &pk_indices, - 0, - )]))]; - - // Rows: 10, NULL, 30 — only the NULL-key row is dropped. - let b = with_null(vec![Some(10), None, Some(30)]); - let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); - let exec = PkHashFilterExec::new(input, pk, sets, 1); - assert_eq!(run(exec).await, vec![10, 30]); - } - - #[tokio::test] - async fn composite_pk_hash_matches_block_set() { - // Composite PK (id, name): block the (2, "b") tuple only. 
- let schema = Arc::new(Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("name", DataType::Utf8, false), - ])); - let mk = |ids: &[i32], names: &[&str]| { - RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(Int32Array::from(ids.to_vec())), - Arc::new(StringArray::from(names.to_vec())), - ], - ) - .unwrap() - }; - let pk = vec!["id".to_string(), "name".to_string()]; - let one_row = mk(&[2], &["b"]); - let pk_indices = resolve_pk_indices(&one_row, &pk).unwrap(); - let sets = vec![Arc::new(HashSet::from([compute_pk_hash( - &one_row, - &pk_indices, - 0, - )]))]; - - // (1,"a") and (2,"a") survive; only the exact (2,"b") tuple is dropped. - let b = mk(&[1, 2, 2], &["a", "a", "b"]); - let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); - let exec = PkHashFilterExec::new(input, pk, sets, 1); - assert_eq!(run(exec).await, vec![1, 2]); - } -} diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec/within_source_dedup.rs b/rust/lance/src/dataset/mem_wal/scanner/exec/within_source_dedup.rs deleted file mode 100644 index be5dae6a668..00000000000 --- a/rust/lance/src/dataset/mem_wal/scanner/exec/within_source_dedup.rs +++ /dev/null @@ -1,432 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright The Lance Authors - -//! WithinSourceDedupExec - Deduplicates rows with the same primary key from a -//! single LSM source, keeping the newest insert. -//! -//! In MemWAL/LSM mode the same primary key can be written multiple times into -//! the same memtable. The active memtable stores rows in insert order (larger -//! `_rowaddr` = newer), while flushed memtables are reverse-written so that -//! within a flushed file the smallest `_rowid` is the newest insert (see -//! `memtable/flush.rs:152` and `hnsw/storage.rs:307`). Point lookup uses this -//! node to collapse such duplicates *within a single source* so that the -//! 
downstream `CoalesceFirstExec` / `LIMIT` sees at most one row per primary -//! key per source. - -use std::any::Any; -use std::collections::HashMap; -use std::fmt; -use std::pin::Pin; -use std::sync::Arc; -use std::task::{Context, Poll}; - -use arrow_array::{Array, RecordBatch, UInt64Array}; -use arrow_schema::SchemaRef; -use datafusion::error::Result as DFResult; -use datafusion::execution::TaskContext; -use datafusion::physical_expr::{EquivalenceProperties, Partitioning}; -use datafusion::physical_plan::{ - DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties, - SendableRecordBatchStream, -}; -use futures::{Stream, StreamExt, ready}; - -use super::pk::{compute_pk_hash, resolve_pk_indices}; - -/// Among rows that share a primary key, which row-address extreme identifies -/// the newest insert to keep. The kept row is always the freshest; only the -/// row address (`_rowaddr`/`_rowid`) used to find it differs by source. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum DedupDirection { - /// Keep the row with the largest row-address value (active memtable: larger - /// `_rowaddr` = inserted later). - KeepMaxRowAddr, - /// Keep the row with the smallest row-address value (flushed memtable under - /// reverse-write: smaller `_rowid` = inserted later). - KeepMinRowAddr, -} - -/// Deduplicates rows from a single source by primary key, keeping the row -/// whose `row_addr_column` value wins per [`DedupDirection`]. -/// -/// # Required columns -/// -/// The input must expose: -/// - All `pk_columns` -/// - `row_addr_column` of `UInt64` type -/// -/// The output schema is unchanged from the input. Callers that need to hide -/// the row-address column from downstream consumers should compose this node -/// with `project_to_canonical` or `null_columns`. -/// -/// # Performance -/// -/// Memory: `O(unique primary keys in input)`. 
For point lookup the input is -/// already filtered to a single primary key so the map holds at most one -/// entry. -#[derive(Debug)] -pub struct WithinSourceDedupExec { - input: Arc, - pk_columns: Vec, - row_addr_column: String, - direction: DedupDirection, - schema: SchemaRef, - properties: Arc, -} - -impl WithinSourceDedupExec { - pub fn new( - input: Arc, - pk_columns: Vec, - row_addr_column: impl Into, - direction: DedupDirection, - ) -> Self { - let schema = input.schema(); - let properties = Arc::new(PlanProperties::new( - EquivalenceProperties::new(schema.clone()), - Partitioning::UnknownPartitioning(1), - input.pipeline_behavior(), - input.boundedness(), - )); - Self { - input, - pk_columns, - row_addr_column: row_addr_column.into(), - direction, - schema, - properties, - } - } - - pub fn pk_columns(&self) -> &[String] { - &self.pk_columns - } - - pub fn row_addr_column(&self) -> &str { - &self.row_addr_column - } - - pub fn direction(&self) -> DedupDirection { - self.direction - } -} - -impl DisplayAs for WithinSourceDedupExec { - fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { - match t { - DisplayFormatType::Default - | DisplayFormatType::Verbose - | DisplayFormatType::TreeRender => { - write!( - f, - "WithinSourceDedupExec: pk=[{}], row_addr={}, direction={:?}", - self.pk_columns.join(", "), - self.row_addr_column, - self.direction, - ) - } - } - } -} - -impl ExecutionPlan for WithinSourceDedupExec { - fn name(&self) -> &str { - "WithinSourceDedupExec" - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn schema(&self) -> SchemaRef { - self.schema.clone() - } - - fn properties(&self) -> &Arc { - &self.properties - } - - fn children(&self) -> Vec<&Arc> { - vec![&self.input] - } - - fn with_new_children( - self: Arc, - children: Vec>, - ) -> DFResult> { - if children.len() != 1 { - return Err(datafusion::error::DataFusionError::Internal( - "WithinSourceDedupExec requires exactly one child".to_string(), - )); - } - 
Ok(Arc::new(Self::new( - children[0].clone(), - self.pk_columns.clone(), - self.row_addr_column.clone(), - self.direction, - ))) - } - - fn execute( - &self, - partition: usize, - context: Arc, - ) -> DFResult { - let input_stream = self.input.execute(partition, context)?; - Ok(Box::pin(WithinSourceDedupStream { - input: input_stream, - pk_columns: self.pk_columns.clone(), - row_addr_column: self.row_addr_column.clone(), - direction: self.direction, - schema: self.schema.clone(), - winners: HashMap::new(), - emitted: false, - })) - } -} - -/// One winning row, materialized as a single-row `RecordBatch` so we don't -/// have to keep the source batch alive after we've picked the winner. -struct Winner { - batch: RecordBatch, - row_addr: u64, -} - -struct WithinSourceDedupStream { - input: SendableRecordBatchStream, - pk_columns: Vec, - row_addr_column: String, - direction: DedupDirection, - schema: SchemaRef, - winners: HashMap, - emitted: bool, -} - -impl WithinSourceDedupStream { - fn consume_batch(&mut self, batch: RecordBatch) -> DFResult<()> { - if batch.num_rows() == 0 { - return Ok(()); - } - let pk_indices = resolve_pk_indices(&batch, &self.pk_columns)?; - let row_addr_array = batch - .column_by_name(&self.row_addr_column) - .ok_or_else(|| { - datafusion::error::DataFusionError::Internal(format!( - "Row-address column '{}' not found in batch", - self.row_addr_column - )) - })? - .as_any() - .downcast_ref::() - .ok_or_else(|| { - datafusion::error::DataFusionError::Internal(format!( - "Row-address column '{}' is not UInt64", - self.row_addr_column - )) - })?; - - for row_idx in 0..batch.num_rows() { - if row_addr_array.is_null(row_idx) { - // A NULL row address can't be ordered against a real one. Skip - // rather than guess — callers should always project a real - // row-address column for dedup-eligible sources. 
- continue; - } - let row_addr = row_addr_array.value(row_idx); - let pk_hash = compute_pk_hash(&batch, &pk_indices, row_idx); - - let take_row = match self.winners.get(&pk_hash) { - None => true, - Some(existing) => match self.direction { - DedupDirection::KeepMaxRowAddr => row_addr > existing.row_addr, - DedupDirection::KeepMinRowAddr => row_addr < existing.row_addr, - }, - }; - - if take_row { - let single = batch.slice(row_idx, 1); - self.winners.insert( - pk_hash, - Winner { - batch: single, - row_addr, - }, - ); - } - } - Ok(()) - } - - fn finalize(&mut self) -> DFResult { - if self.winners.is_empty() { - return Ok(RecordBatch::new_empty(self.schema.clone())); - } - let batches: Vec = self.winners.drain().map(|(_, w)| w.batch).collect(); - let batch_refs: Vec<&RecordBatch> = batches.iter().collect(); - arrow_select::concat::concat_batches(&self.schema, batch_refs) - .map_err(|e| datafusion::error::DataFusionError::ArrowError(Box::new(e), None)) - } -} - -impl Stream for WithinSourceDedupStream { - type Item = DFResult; - - fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - loop { - if self.emitted { - return Poll::Ready(None); - } - match ready!(self.input.poll_next_unpin(cx)) { - Some(Ok(batch)) => { - if let Err(e) = self.consume_batch(batch) { - self.emitted = true; - return Poll::Ready(Some(Err(e))); - } - } - Some(Err(e)) => { - self.emitted = true; - return Poll::Ready(Some(Err(e))); - } - None => { - self.emitted = true; - return Poll::Ready(Some(self.finalize())); - } - } - } - } -} - -impl datafusion::physical_plan::RecordBatchStream for WithinSourceDedupStream { - fn schema(&self) -> SchemaRef { - self.schema.clone() - } -} - -#[cfg(test)] -mod tests { - use super::*; - use arrow_array::{Float32Array, Int32Array, StringArray}; - use arrow_schema::{DataType, Field, Schema}; - use datafusion::prelude::SessionContext; - use datafusion_physical_plan::test::TestMemoryExec; - use futures::TryStreamExt; - - fn create_test_schema() -> 
SchemaRef { - Arc::new(Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("name", DataType::Utf8, true), - Field::new("_distance", DataType::Float32, true), - Field::new("_row_addr", DataType::UInt64, true), - ])) - } - - fn batch(ids: &[i32], names: &[&str], distances: &[f32], row_addr: &[u64]) -> RecordBatch { - let schema = create_test_schema(); - RecordBatch::try_new( - schema, - vec![ - Arc::new(Int32Array::from(ids.to_vec())), - Arc::new(StringArray::from(names.to_vec())), - Arc::new(Float32Array::from(distances.to_vec())), - Arc::new(UInt64Array::from(row_addr.to_vec())), - ], - ) - .unwrap() - } - - async fn run(batches: Vec, direction: DedupDirection) -> Vec { - let schema = create_test_schema(); - let input = TestMemoryExec::try_new_exec(&[batches], schema, None).unwrap(); - let exec = - WithinSourceDedupExec::new(input, vec!["id".to_string()], "_row_addr", direction); - let ctx = SessionContext::new(); - let stream = exec.execute(0, ctx.task_ctx()).unwrap(); - stream.try_collect().await.unwrap() - } - - fn extract(batches: &[RecordBatch]) -> Vec<(i32, String, u64)> { - let mut out = Vec::new(); - for b in batches { - let ids = b.column(0).as_any().downcast_ref::().unwrap(); - let names = b.column(1).as_any().downcast_ref::().unwrap(); - let addr = b.column(3).as_any().downcast_ref::().unwrap(); - for i in 0..b.num_rows() { - out.push((ids.value(i), names.value(i).to_string(), addr.value(i))); - } - } - out.sort_by_key(|(id, _, _)| *id); - out - } - - #[tokio::test] - async fn keep_max_picks_largest_row_addr() { - // Active-memtable case: same pk inserted twice; newer = larger _rowaddr. 
- let b1 = batch( - &[1, 1, 2], - &["old", "new", "two"], - &[0.1, 0.2, 0.3], - &[10, 99, 5], - ); - let out = run(vec![b1], DedupDirection::KeepMaxRowAddr).await; - let rows = extract(&out); - assert_eq!(rows.len(), 2); - assert_eq!(rows[0], (1, "new".to_string(), 99)); - assert_eq!(rows[1], (2, "two".to_string(), 5)); - } - - #[tokio::test] - async fn keep_min_picks_smallest_row_addr() { - // Flushed-memtable case under reverse-write: newer = smaller _rowid. - let b1 = batch( - &[1, 1, 2], - &["old", "new", "two"], - &[0.1, 0.2, 0.3], - &[99, 10, 5], - ); - let out = run(vec![b1], DedupDirection::KeepMinRowAddr).await; - let rows = extract(&out); - assert_eq!(rows.len(), 2); - assert_eq!(rows[0], (1, "new".to_string(), 10)); - assert_eq!(rows[1], (2, "two".to_string(), 5)); - } - - #[tokio::test] - async fn dedup_across_batches() { - let b1 = batch(&[1, 2], &["a", "b"], &[0.1, 0.2], &[1, 1]); - let b2 = batch(&[1, 3], &["a_new", "c"], &[0.5, 0.4], &[7, 1]); - let out = run(vec![b1, b2], DedupDirection::KeepMaxRowAddr).await; - let rows = extract(&out); - assert_eq!(rows.len(), 3); - assert_eq!(rows[0], (1, "a_new".to_string(), 7)); - assert_eq!(rows[1], (2, "b".to_string(), 1)); - assert_eq!(rows[2], (3, "c".to_string(), 1)); - } - - #[tokio::test] - async fn empty_input() { - let out = run(vec![], DedupDirection::KeepMaxRowAddr).await; - let total: usize = out.iter().map(|b| b.num_rows()).sum(); - assert_eq!(total, 0); - } - - #[tokio::test] - async fn null_row_addr_skipped() { - // Rows with NULL row address can't be ordered — they're dropped so they - // don't accidentally become winners against real values. 
- let schema = create_test_schema(); - let b = RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(Int32Array::from(vec![1, 1])), - Arc::new(StringArray::from(vec!["nulladdr", "real"])), - Arc::new(Float32Array::from(vec![0.1, 0.2])), - Arc::new(UInt64Array::from(vec![None, Some(5)])), - ], - ) - .unwrap(); - let out = run(vec![b], DedupDirection::KeepMaxRowAddr).await; - let rows = extract(&out); - assert_eq!(rows.len(), 1); - assert_eq!(rows[0], (1, "real".to_string(), 5)); - } -} diff --git a/rust/lance/src/dataset/mem_wal/scanner/flushed_cache.rs b/rust/lance/src/dataset/mem_wal/scanner/flushed_cache.rs index 39abf7e8c71..0c2d3b039fe 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/flushed_cache.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/flushed_cache.rs @@ -41,12 +41,10 @@ use crate::session::Session; pub struct FlushedMemTableCache { // `moka`'s async cache gives a bounded size plus single-flight // `try_get_with`, so concurrent first-queries on a just-flushed - // generation open the dataset exactly once. + // generation open the dataset exactly once. The opened dataset carries the + // session index cache, which also backs each generation's standalone PK + // dedup index (see `block_list::open_pk_index`) — no separate cache path. inner: moka::future::Cache>, - // Per-generation set of PK hashes for the vector-search block-list, keyed by - // the same immutable flushed path. Built lazily on the first query that needs - // it (single-flight) so repeated searches skip re-scanning the PK column. - pk_hashes: moka::future::Cache>>, } impl FlushedMemTableCache { @@ -63,10 +61,6 @@ impl FlushedMemTableCache { // into at build time. 
.support_invalidation_closures() .build(), - pk_hashes: moka::future::Cache::builder() - .max_capacity(max_entries) - .support_invalidation_closures() - .build(), } } @@ -96,21 +90,6 @@ impl FlushedMemTableCache { .map_err(|e: Arc| Error::cloned(e.to_string())) } - /// Get the cached set of PK hashes for `path`, building it (exactly once) on - /// a miss via `build`. The flushed path is immutable, so a cached set is - /// never stale; concurrent first-queries share one build via `moka`'s - /// single-flight `try_get_with`. - pub async fn get_or_build_pk_hashes( - &self, - path: &str, - build: impl std::future::Future>>, - ) -> Result>> { - self.pk_hashes - .try_get_with(path.to_string(), async move { build.await.map(Arc::new) }) - .await - .map_err(|e: Arc| Error::cloned(e.to_string())) - } - /// Drop cached entries whose path is not in `live_paths`. /// /// Called by the consumer after compaction retires generations. Purely a @@ -125,10 +104,6 @@ impl FlushedMemTableCache { let _ = self .inner .invalidate_entries_if(move |path, _| !live.contains(path)); - let live = live_paths.clone(); - let _ = self - .pk_hashes - .invalidate_entries_if(move |path, _| !live.contains(path)); } } @@ -250,34 +225,6 @@ mod tests { assert_eq!(cache.inner.entry_count(), 1, "exactly one entry cached"); } - #[tokio::test] - async fn pk_hashes_cached_reuses_first_build() { - // The PK-hash set is keyed by the immutable flushed path: a hit returns - // the first-built set and never runs the second build closure. - let cache = FlushedMemTableCache::new(8); - let path = "memory://shard/gen_1"; - let first = cache - .get_or_build_pk_hashes(path, async { Ok(HashSet::from([1u64, 2])) }) - .await - .unwrap(); - let second = cache - .get_or_build_pk_hashes(path, async { - // Different contents; must be ignored because the path is cached. 
- Ok(HashSet::from([9u64])) - }) - .await - .unwrap(); - assert!( - Arc::ptr_eq(&first, &second), - "a PK-hash cache hit must reuse the first-built set" - ); - assert_eq!( - second.len(), - 2, - "cached set keeps the first build's contents" - ); - } - #[tokio::test] async fn test_retain_paths_drops_unreferenced() { let temp_dir = tempfile::tempdir().unwrap(); diff --git a/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs b/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs index 626b0effe3c..92298524535 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs @@ -44,7 +44,7 @@ use datafusion::physical_plan::ExecutionPlan; use datafusion::physical_plan::sorts::sort::SortExec; use datafusion::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; use datafusion::physical_plan::union::UnionExec; -use lance_core::{Error, ROW_ID, Result, is_system_column}; +use lance_core::{Error, Result, is_system_column}; use lance_index::scalar::FullTextSearchQuery; use lance_index::scalar::inverted::query::FtsQuery as IndexFtsQuery; use tracing::instrument; @@ -52,7 +52,7 @@ use tracing::instrument; use super::block_list::compute_source_block_lists; use super::collector::LsmDataSourceCollector; use super::data_source::LsmDataSource; -use super::exec::{DedupDirection, PkHashFilterExec, WithinSourceDedupExec}; +use super::exec::{NewestPkFilterExec, PkBlockFilterExec}; use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset}; use super::projection::project_to_canonical; use crate::dataset::mem_wal::memtable::scanner::MemTableScanner; @@ -154,13 +154,12 @@ impl LsmFtsSearchPlanner { return self.empty_plan(&target_schema); } - // Per-source PK-hash block sets for cross-generation dedup (NEWER(G) - // per shard; base = union of all gens). Query-type-agnostic — same - // call the vector planner makes. 
`Box::pin` keeps the future off + // Per-source PK block sets for cross-generation dedup (NEWER(G) per + // shard; base = union of all gens). Query-type-agnostic — same call the + // vector planner makes. `Box::pin` keeps the future off // `clippy::large_futures`. let block_lists = Box::pin(compute_source_block_lists( &sources, - &self.pk_columns, self.session.as_ref(), self.flushed_cache.as_ref(), )) @@ -187,13 +186,9 @@ impl LsmFtsSearchPlanner { }) .collect(); let built = - futures::future::try_join_all(arm_inputs.iter().map( - |(source, is_active, _, fetch_k)| { - Box::pin(self.build_source_plan( - source, column, &query, *fetch_k, projection, *is_active, - )) - }, - )) + futures::future::try_join_all(arm_inputs.iter().map(|(source, _, _, fetch_k)| { + Box::pin(self.build_source_plan(source, column, &query, *fetch_k, projection)) + })) .await?; let mut per_source_plans: Vec> = Vec::with_capacity(sources.len()); @@ -201,20 +196,15 @@ impl LsmFtsSearchPlanner { let is_active = *is_active; let blocked = *blocked; // Dedup, mirroring LsmVectorSearchPlanner: - // * active: collapse duplicate-PK appends to the newest insert - // (larger _rowid = inserted later). The FTS index is append-only, - // so an in-memtable update leaves both versions searchable. + // * active: already wrapped in `NewestPkFilterExec` inside + // `build_source_plan` (drops predicate-crossing stale hits, which a + // result-set dedup can't catch). // * flushed/base: drop rows superseded by a newer generation via the // block-list (within-gen is handled by the flushed deletion vector). 
let deduped = if is_active { - Arc::new(WithinSourceDedupExec::new( - plan, - self.pk_columns.clone(), - ROW_ID, - DedupDirection::KeepMaxRowAddr, - )) as Arc + plan } else if let Some(set) = blocked { - Arc::new(PkHashFilterExec::new( + Arc::new(PkBlockFilterExec::new( plan, self.pk_columns.clone(), set.clone(), @@ -282,7 +272,6 @@ impl LsmFtsSearchPlanner { query: &FullTextSearchQuery, k: usize, projection: Option<&[String]>, - emit_row_id: bool, ) -> Result> { match source { LsmDataSource::BaseTable { dataset } => { @@ -320,11 +309,12 @@ impl LsmFtsSearchPlanner { MemTableScanner::new(batch_store.clone(), index_store.clone(), schema.clone()); let cols = self.fts_scanner_projection(projection); scanner.project(&cols.iter().map(|s| s.as_str()).collect::>()); - // Emit `_rowid` (row position) so the planner can collapse - // duplicate-PK appends via WithinSourceDedupExec before the union. - if emit_row_id { - scanner.with_row_id(); - } + // Expose the row position so the recency filter can identify the + // newest visible version of each PK. The append-only inverted + // index keeps an updated row's old postings live, so a stale hit + // can match a query the fresh row no longer does; the filter + // drops it. `project_to_canonical` strips `_rowid` afterward. + scanner.with_row_id(); // `MemTableScanner::full_text_search` takes a raw match // string; richer query shapes (phrase/boolean/fuzzy) can // be plumbed through once the MemTable scanner accepts a @@ -343,7 +333,19 @@ impl LsmFtsSearchPlanner { // today; the per-partition Sort+fetch above bounds the // emitted rows. let _ = k; - scanner.create_plan().await + let plan = scanner.create_plan().await?; + // Drop predicate-crossing stale hits: keep a hit iff it is the + // newest visible version of its PK (collapses duplicate-PK + // appends too — supersedes the old WithinSourceDedupExec). 
+ let filtered: Arc = Arc::new(NewestPkFilterExec::new( + plan, + self.pk_columns.clone(), + lance_core::ROW_ID, + index_store.clone(), + batch_store.clone(), + scanner.max_visible_batch_position(), + )); + Ok(filtered) } } } @@ -497,6 +499,7 @@ mod tests { // Active memtable with its own FTS index, containing a matching row. let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut indexes = IndexStore::new(); + indexes.enable_pk_index(&[("id".to_string(), 0)]); indexes.add_fts("text_fts".to_string(), 1, "text".to_string()); let active_batch = make_batch( &schema, @@ -665,6 +668,7 @@ mod tests { let schema = fts_schema(); let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut indexes = IndexStore::new(); + indexes.enable_pk_index(&[("id".to_string(), 0)]); indexes.add_fts("text_fts".to_string(), 1, "text".to_string()); // First append (positions 0,1): id=1 is the stale version of the PK. @@ -744,4 +748,88 @@ mod tests { "dedup must keep the newest (max row-position) version" ); } + + #[tokio::test] + async fn active_stale_update_predicate_crossing_leaks() { + // A PK update that crosses out of the match set: pk=1 inserted as + // "alpha lance", then updated to "beta lance". The append-only inverted + // index keeps the old "alpha" posting live, so an "alpha" search still + // matches the STALE pk=1 row — and the fresh "beta lance" row isn't even + // a candidate, so a result-set dedup has nothing to suppress it against. + // `NewestPkFilterExec` drops it predicate-independently: pk=1's newest + // visible row is "beta lance", so the "alpha" hit is not the newest. + let schema = fts_schema(); + let batch_store = Arc::new(BatchStore::with_capacity(16)); + let mut indexes = IndexStore::new(); + indexes.enable_pk_index(&[("id".to_string(), 0)]); + indexes.add_fts("text_fts".to_string(), 1, "text".to_string()); + + // Insert pk=1 ("alpha lance") and an unrelated live pk=2 ("alpha foo"). 
+ let b1 = make_batch(&schema, &[1, 2], &["alpha lance", "alpha foo"]); + let (bp1, off1, _) = batch_store.append(b1.clone()).unwrap(); + indexes + .insert_with_batch_position(&b1, off1, Some(bp1)) + .unwrap(); + + // Update pk=1 → "beta lance" (no longer matches "alpha"). + let b2 = make_batch(&schema, &[1], &["beta lance"]); + let (bp2, off2, _) = batch_store.append(b2.clone()).unwrap(); + indexes + .insert_with_batch_position(&b2, off2, Some(bp2)) + .unwrap(); + let indexes = Arc::new(indexes); + + let tmp = tempfile::tempdir().unwrap(); + let base_uri = format!("{}/base", tmp.path().to_str().unwrap()); + let collector = LsmDataSourceCollector::without_base_table(base_uri, vec![]) + .with_in_memory_memtables( + uuid::Uuid::new_v4(), + InMemoryMemTables { + active: InMemoryMemTableRef { + batch_store, + index_store: indexes, + schema: schema.clone(), + generation: 1, + }, + frozen: vec![], + }, + ); + + let planner = LsmFtsSearchPlanner::new(collector, vec!["id".to_string()], schema); + let plan = planner + .plan_search( + "text", + FullTextSearchQuery::new("alpha".to_string()), + 10, + None, + ) + .await + .expect("planner should produce a plan"); + + let ctx = datafusion::prelude::SessionContext::new(); + let stream = plan.execute(0, ctx.task_ctx()).unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + + let mut ids: Vec = Vec::new(); + for b in &batches { + let col = b + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..b.num_rows() { + ids.push(col.value(i)); + } + } + + assert!( + !ids.contains(&1), + "stale pk=1 (now 'beta lance') leaked on an 'alpha' search; got ids={ids:?}" + ); + assert!( + ids.contains(&2), + "live pk=2 ('alpha foo') must still match 'alpha'; got ids={ids:?}" + ); + } } diff --git a/rust/lance/src/dataset/mem_wal/scanner/planner.rs b/rust/lance/src/dataset/mem_wal/scanner/planner.rs index eca0255be1c..8b74f9efd79 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/planner.rs +++ 
b/rust/lance/src/dataset/mem_wal/scanner/planner.rs @@ -15,7 +15,7 @@ use tracing::instrument; use super::collector::LsmDataSourceCollector; use super::data_source::LsmDataSource; -use super::exec::{MEMTABLE_GEN_COLUMN, MemtableGenTagExec, PkHashFilterExec, ROW_ADDRESS_COLUMN}; +use super::exec::{MEMTABLE_GEN_COLUMN, MemtableGenTagExec, PkBlockFilterExec, ROW_ADDRESS_COLUMN}; use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset}; use super::projection::{ build_scanner_projection, canonical_output_schema, null_columns, project_to_canonical, @@ -94,7 +94,7 @@ impl LsmScanPlanner { /// Each source is independently newest-per-PK (active via the fused /// [`MemTableDedupScanExec`](super::super::memtable::scanner), flushed via /// its within-generation deletion vector) and a cross-generation block-list - /// ([`PkHashFilterExec`]) drops any PK superseded by a newer generation. + /// ([`PkBlockFilterExec`]) drops any PK superseded by a newer generation. /// Each PK therefore survives in exactly one source, so a plain /// `UnionExec` carries at most one row per PK — no cross-source dedup, /// sort, or merge needed. `_memtable_gen` / `_rowaddr` are output-only and @@ -131,7 +131,6 @@ impl LsmScanPlanner { // `Box::pin` keeps the future off `clippy::large_futures`. let block_lists = Box::pin(super::block_list::compute_source_block_lists( &sources, - &self.pk_columns, self.session.as_ref(), self.flushed_cache.as_ref(), )) @@ -148,7 +147,7 @@ impl LsmScanPlanner { // instead of scanning whole generations and trimming above the // union. Block-listed sources over-fetch by `overfetch_factor` so // cross-gen dedup drops still leave `n_needed` live rows; the - // PkHashFilter warns when that was not enough. The active memtable + // PkBlockFilter warns when that was not enough. The active memtable // is in-memory and within-gen append duplicates are resolved by its // own dedup, so it is never capped here. 
let n_needed = limit.map(|l| l.saturating_add(offset.unwrap_or(0))); @@ -177,7 +176,7 @@ impl LsmScanPlanner { // With a limit, `k = n_needed` arms the under-fetch warning; with // no limit `k = 0` keeps it silent. let scan = match blocked { - Some(set) => Arc::new(PkHashFilterExec::new( + Some(set) => Arc::new(PkBlockFilterExec::new( scan, self.pk_columns.clone(), set, @@ -473,13 +472,36 @@ mod integration_tests { .unwrap() } - /// Create a dataset at the given URI with the provided batches. + /// Create a dataset at the given URI with the provided batches. Also writes + /// the standalone PK sidecar (on `id`) so a flushed-generation source can be + /// probed by the block-list; harmless for a base table (never probed). async fn create_dataset(uri: &str, batches: Vec) -> Dataset { let schema = batches[0].schema(); - let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema); - Dataset::write(reader, uri, Some(WriteParams::default())) + let has_id = schema.column_with_name("id").is_some(); + let reader = RecordBatchIterator::new(batches.clone().into_iter().map(Ok), schema); + let dataset = Dataset::write(reader, uri, Some(WriteParams::default())) .await - .unwrap() + .unwrap(); + if has_id { + super::super::block_list::write_pk_sidecar(uri, &batches, &["id"]) + .await + .unwrap(); + } + dataset + } + + /// Build an in-memory memtable's `(batch_store, index_store)` with the PK + /// index enabled and populated (mirrors production — the block-list needs + /// the PK index to dedup in-memory generations). 
+ fn pk_indexed(batches: &[RecordBatch]) -> (Arc, Arc) { + let batch_store = Arc::new(BatchStore::with_capacity(100)); + let mut index = IndexStore::new(); + index.enable_pk_index(&[("id".to_string(), 0)]); + for b in batches { + let (bp, off, _) = batch_store.append(b.clone()).unwrap(); + index.insert_with_batch_position(b, off, Some(bp)).unwrap(); + } + (batch_store, Arc::new(index)) } /// Setup a multi-level LSM structure with: @@ -530,10 +552,8 @@ mod integration_tests { .with_flushed_generation(2, "gen_2".to_string()); // Create active memtable - let batch_store = Arc::new(BatchStore::with_capacity(100)); - let index_store = Arc::new(IndexStore::new()); - let active_batch = create_test_batch(&schema, &[5, 6, 7], "active"); - let _ = batch_store.append(active_batch); + let (batch_store, index_store) = + pk_indexed(&[create_test_batch(&schema, &[5, 6, 7], "active")]); let active_memtable = InMemoryMemTables { active: InMemoryMemTableRef { @@ -575,18 +595,18 @@ mod integration_tests { // Verify the plan (gen DESC order: active -> gen2 -> gen1 -> base): // - plain UnionExec at top // - active arm: MemTableDedupScanExec (newest gen, not block-listed) - // - older arms: PkHashFilterExec (cross-gen block-list) -> LanceRead + // - older arms: PkBlockFilterExec (cross-gen block-list) -> LanceRead assert_plan_node_equals( plan, "ProjectionExec:... CoalescePartitionsExec UnionExec MemTableDedupScanExec: projection=[id, name, _rowaddr], with_row_id=false, with_row_address=true - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_2... - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_1... - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... 
LanceRead:...base/data...refine_filter=--", ) .await @@ -609,9 +629,9 @@ mod integration_tests { // Verify the plan with `_memtable_gen` tags (gen DESC order): // - plain UnionExec at top - // - each arm: MemtableGenTagExec -> (PkHashFilterExec ->) data source + // - each arm: MemtableGenTagExec -> (PkBlockFilterExec ->) data source // - gen3 (active): MemtableGenTagExec -> MemTableDedupScanExec - // - gen2/gen1/base: MemtableGenTagExec -> PkHashFilterExec -> LanceRead + // - gen2/gen1/base: MemtableGenTagExec -> PkBlockFilterExec -> LanceRead assert_plan_node_equals( plan, "ProjectionExec:... @@ -620,13 +640,13 @@ mod integration_tests { MemtableGenTagExec: gen=gen3 MemTableDedupScanExec: projection=[id, name, _rowaddr], with_row_id=false, with_row_address=true MemtableGenTagExec: gen=gen2 - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_2... MemtableGenTagExec: gen=gen1 - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_1... MemtableGenTagExec: gen=base - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...base/data...refine_filter=--", ) .await @@ -707,14 +727,14 @@ mod integration_tests { } // base/gen1/gen2 all hold PKs superseded by a newer generation, so each - // is wrapped in a `PkHashFilterExec`; the newest (active) arm is not. + // is wrapped in a `PkBlockFilterExec`; the newest (active) arm is not. let plan = scanner.create_plan().await.unwrap(); let plan_str = format!( "{}", datafusion::physical_plan::displayable(plan.as_ref()).indent(true) ); assert!( - plan_str.contains("PkHashFilterExec"), + plan_str.contains("PkBlockFilterExec"), "filtered-read plan must apply the cross-gen block-list, got:\n{}", plan_str ); @@ -790,21 +810,21 @@ mod integration_tests { .with_flushed_generation(2, "gen_2".to_string()); // Frozen gen3 (sealed, NOT in the manifest) and active gen4. 
- let frozen_store = Arc::new(BatchStore::with_capacity(100)); - let _ = frozen_store.append(create_test_batch(&schema, &[6, 7], "frozen")); + let (frozen_store, frozen_index) = + pk_indexed(&[create_test_batch(&schema, &[6, 7], "frozen")]); let frozen = InMemoryMemTableRef { batch_store: frozen_store, - index_store: Arc::new(IndexStore::new()), + index_store: frozen_index, schema: schema.clone(), generation: 3, }; - let active_store = Arc::new(BatchStore::with_capacity(100)); - let _ = active_store.append(create_test_batch(&schema, &[7, 8], "active")); + let (active_store, active_index) = + pk_indexed(&[create_test_batch(&schema, &[7, 8], "active")]); let in_memory = InMemoryMemTables { active: InMemoryMemTableRef { batch_store: active_store, - index_store: Arc::new(IndexStore::new()), + index_store: active_index, schema: schema.clone(), generation: 4, }, @@ -1029,12 +1049,12 @@ mod integration_tests { ProjectionExec: expr=[id@0 as id, name@1 as name, NULL as _rowaddr] MemTableDedupScanExec: projection=[id, name, _rowaddr], with_row_id=false, with_row_address=true ProjectionExec: expr=[id@0 as id, name@1 as name, NULL as _rowaddr] - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_2... ProjectionExec: expr=[id@0 as id, name@1 as name, NULL as _rowaddr] - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_1... - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...base/data...refine_filter=--", ) .await @@ -1097,14 +1117,14 @@ mod integration_tests { MemTableDedupScanExec: projection=[id, name, _rowaddr], with_row_id=false, with_row_address=true MemtableGenTagExec: gen=gen2 ProjectionExec: expr=[id@0 as id, name@1 as name, NULL as _rowaddr] - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_2... 
MemtableGenTagExec: gen=gen1 ProjectionExec: expr=[id@0 as id, name@1 as name, NULL as _rowaddr] - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_1... MemtableGenTagExec: gen=base - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...base/data...refine_filter=--", ) .await @@ -1173,6 +1193,8 @@ mod integration_tests { let mut index_store = IndexStore::new(); // Add BTree index on id column (field_id=0) index_store.add_btree("id_idx".to_string(), 0, "id".to_string()); + // Reuse it as the PK index so the block-list can dedup this generation. + index_store.enable_pk_index(&[("id".to_string(), 0)]); let active_batch = create_test_batch(&schema, &[5, 6, 7], "active"); let _ = batch_store.append(active_batch.clone()); @@ -1237,7 +1259,7 @@ mod integration_tests { // 1. Verify overall structure assert!(plan_str.contains("UnionExec"), "Should have UnionExec"); assert!( - plan_str.contains("PkHashFilterExec"), + plan_str.contains("PkBlockFilterExec"), "older generations should be block-list filtered" ); assert!( @@ -1425,7 +1447,6 @@ mod integration_tests { // Active memtable: id=10 inserted ("keep") then updated to NULL within // the same generation; id=20 ("active_20") is a control that matches. 
- let batch_store = Arc::new(BatchStore::with_capacity(16)); let active_batch = RecordBatch::try_new( schema.clone(), vec![ @@ -1438,12 +1459,12 @@ mod integration_tests { ], ) .unwrap(); - batch_store.append(active_batch).unwrap(); + let (batch_store, index_store) = pk_indexed(&[active_batch]); let in_memory = InMemoryMemTables { active: InMemoryMemTableRef { batch_store, - index_store: Arc::new(IndexStore::new()), + index_store, schema: schema.clone(), generation: 1, }, diff --git a/rust/lance/src/dataset/mem_wal/scanner/point_lookup.rs b/rust/lance/src/dataset/mem_wal/scanner/point_lookup.rs index d1353e72dcc..3902eb04589 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/point_lookup.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/point_lookup.rs @@ -9,11 +9,14 @@ use std::collections::HashMap; use std::sync::Arc; use arrow_array::{Array, RecordBatch}; -use arrow_schema::SchemaRef; +use arrow_schema::{SchemaRef, SortOptions}; use datafusion::common::ScalarValue; use datafusion::execution::TaskContext; +use datafusion::physical_expr::expressions::Column; +use datafusion::physical_expr::{LexOrdering, PhysicalSortExpr}; use datafusion::physical_plan::ExecutionPlan; use datafusion::physical_plan::limit::GlobalLimitExec; +use datafusion::physical_plan::sorts::sort::SortExec; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::prelude::{Expr, SessionContext}; use futures::TryStreamExt; @@ -27,10 +30,7 @@ use crate::dataset::mem_wal::memtable::batch_store::BatchStore; use super::collector::LsmDataSourceCollector; use super::data_source::LsmDataSource; -use super::exec::{ - BloomFilterGuardExec, CoalesceFirstExec, DedupDirection, WithinSourceDedupExec, - compute_pk_hash_from_scalars, -}; +use super::exec::{BloomFilterGuardExec, CoalesceFirstExec, compute_pk_hash_from_scalars}; use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset}; use super::projection::{ build_scanner_projection, canonical_output_schema, null_columns, 
project_to_canonical, @@ -573,19 +573,29 @@ impl LsmPointLookupPlanner { // multiple rows sharing the target primary key. scanner.with_row_id(); let raw = scanner.create_plan().await?; - // Within the active memtable, larger `_rowid` = newer - // insert. After dedup there is exactly one row per PK. - let deduped: Arc = Arc::new(WithinSourceDedupExec::new( - raw, - self.pk_columns.clone(), - lance_core::ROW_ID, - DedupDirection::KeepMaxRowAddr, - )); + // The filter already restricts to the exact PK value, so the + // scan yields that key's insert history. Within the active + // memtable larger `_rowid` = newer insert, so sorting `_rowid` + // DESC and keeping the first row picks the newest version — one + // row per (value-exact) PK. + let rowid_idx = raw.schema().index_of(lance_core::ROW_ID)?; + let ordering = LexOrdering::new(vec![PhysicalSortExpr { + expr: Arc::new(Column::new(lance_core::ROW_ID, rowid_idx)), + options: SortOptions { + descending: true, + nulls_first: false, + }, + }]) + .ok_or_else(|| { + lance_core::Error::internal("point-lookup: failed to build _rowid ordering") + })?; + let newest: Arc = + Arc::new(SortExec::new(ordering, raw).with_fetch(Some(1))); // Per-source `_rowid` would collide with the base table's; // NULL it before canonicalization (the value is internal to // this arm). project_to_canonical drops it entirely when // the user didn't request `_rowid` in the projection. - null_columns(deduped, &[lance_core::ROW_ID])? + null_columns(newest, &[lance_core::ROW_ID])? } }; project_to_canonical(scan, &target) @@ -642,10 +652,6 @@ fn probe_position( pk_column: &str, pk_value: &ScalarValue, ) -> Result { - let Some(btree) = index_store.get_btree_by_column(pk_column) else { - return Ok(ProbePos::NoIndex); - }; - // Visible batches are the committed prefix [0, last_visible_idx]; each // `StoredBatch` carries its cumulative `row_offset`, so visibility and the // position→batch mapping are O(1)/O(log) with no per-probe allocation. 
@@ -661,22 +667,37 @@ fn probe_position( if visible_end == 0 { return Ok(ProbePos::Miss); } + let max_visible_row = visible_end - 1; - // Newest visible position of the key — a single seek-and-stop on the - // ordered skiplist (largest key ≤ (value, max_visible_row)). No range - // collect, no allocation. - let Some(pos) = btree.get_newest_visible(pk_value, visible_end - 1) else { + // A single-column primary key always has a value-keyed BTree (reused or + // auto-created — see `IndexStore::enable_pk_index`): collision-free, so one + // seek yields the answer with no re-check. Absent only when the table has no + // PK index, where the caller falls back to the plan path. + let Some(btree) = index_store.get_btree_by_column(pk_column) else { + return Ok(ProbePos::NoIndex); + }; + let Some(pos) = btree.get_newest_visible(pk_value, max_visible_row) else { return Ok(ProbePos::Miss); }; + let (batch_idx, row) = resolve_position(batch_store, last_visible_idx, pos)?; + Ok(ProbePos::Found { batch_idx, row }) +} - // Binary-search the owning batch by `row_offset` (appended in order). +/// Map a global row `position` to its `(batch_idx, row_in_batch)` by binary +/// searching the visible batch prefix on cumulative `row_offset` (batches are +/// appended in order). 
+fn resolve_position( + batch_store: &BatchStore, + last_visible_idx: usize, + position: u64, +) -> Result<(usize, usize)> { let (mut lo, mut hi) = (0usize, last_visible_idx); while lo < hi { let mid = lo + (hi - lo).div_ceil(2); let off = batch_store.get(mid).map(|b| b.row_offset).ok_or_else(|| { lance_core::Error::internal("point-lookup: batch index out of range during search") })?; - if off <= pos { + if off <= position { lo = mid; } else { hi = mid - 1; @@ -685,10 +706,7 @@ fn probe_position( let stored = batch_store .get(lo) .ok_or_else(|| lance_core::Error::internal("point-lookup: resolved batch missing"))?; - Ok(ProbePos::Found { - batch_idx: lo, - row: (pos - stored.row_offset) as usize, - }) + Ok((lo, (position - stored.row_offset) as usize)) } /// Gather `rows` from `batch_store`'s batch `batch_idx` into the `target` @@ -1097,8 +1115,8 @@ mod tests { // Regression: same primary key inserted twice into one active // memtable must return the *newest* row. The bug was that // `FilterExec → LIMIT 1` over an insert-ordered scan returned the - // first (oldest) match. `WithinSourceDedupExec` collapses by PK, - // keeping the row with the largest `_rowid` (insert order). + // first (oldest) match. The plan-path active arm now sorts `_rowid` + // DESC and keeps the first row (largest `_rowid` = newest insert). 
use crate::dataset::mem_wal::scanner::collector::{InMemoryMemTableRef, InMemoryMemTables}; use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; use futures::TryStreamExt; @@ -1118,17 +1136,17 @@ mod tests { let b_old = create_test_batch(&schema, &[1], "old"); let b_new = create_test_batch(&schema, &[1], "new"); let b_other = create_test_batch(&schema, &[2], "two"); - let (_, _, bp_old) = batch_store.append(b_old.clone()).unwrap(); + let (bp_old, off_old, _) = batch_store.append(b_old.clone()).unwrap(); index_store - .insert_with_batch_position(&b_old, 0, Some(bp_old)) + .insert_with_batch_position(&b_old, off_old, Some(bp_old)) .unwrap(); - let (_, _, bp_new) = batch_store.append(b_new.clone()).unwrap(); + let (bp_new, off_new, _) = batch_store.append(b_new.clone()).unwrap(); index_store - .insert_with_batch_position(&b_new, 1, Some(bp_new)) + .insert_with_batch_position(&b_new, off_new, Some(bp_new)) .unwrap(); - let (_, _, bp_other) = batch_store.append(b_other.clone()).unwrap(); + let (bp_other, off_other, _) = batch_store.append(b_other.clone()).unwrap(); index_store - .insert_with_batch_position(&b_other, 2, Some(bp_other)) + .insert_with_batch_position(&b_other, off_other, Some(bp_other)) .unwrap(); let index_store = Arc::new(index_store); @@ -1168,6 +1186,88 @@ mod tests { ); } + #[tokio::test] + async fn test_point_lookup_probes_auto_created_pk_btree() { + // No user `add_btree` on the PK column — only `enable_pk_index`, which + // auto-creates a BTree on the primary key (the production default). The + // fast probe must resolve the newest visible version through that + // collision-free BTree rather than falling back to the plan path. 
+ use crate::dataset::mem_wal::scanner::collector::{InMemoryMemTableRef, InMemoryMemTables}; + use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; + + let schema = create_pk_schema(); + let temp_dir = tempfile::tempdir().unwrap(); + let base_uri = format!("{}/base", temp_dir.path().to_str().unwrap()); + + let batch_store = Arc::new(BatchStore::with_capacity(16)); + let mut index_store = IndexStore::new(); + // No `add_btree` — `enable_pk_index` auto-creates the PK BTree. + index_store.enable_pk_index(&[("id".to_string(), 0)]); + + // pk=1 written twice (the newer second), plus an unrelated pk=2. + let b_old = create_test_batch(&schema, &[1], "old"); + let b_new = create_test_batch(&schema, &[1], "new"); + let b_other = create_test_batch(&schema, &[2], "two"); + let (bp_old, off_old, _) = batch_store.append(b_old.clone()).unwrap(); + index_store + .insert_with_batch_position(&b_old, off_old, Some(bp_old)) + .unwrap(); + let (bp_new, off_new, _) = batch_store.append(b_new.clone()).unwrap(); + index_store + .insert_with_batch_position(&b_new, off_new, Some(bp_new)) + .unwrap(); + let (bp_other, off_other, _) = batch_store.append(b_other.clone()).unwrap(); + index_store + .insert_with_batch_position(&b_other, off_other, Some(bp_other)) + .unwrap(); + let index_store = Arc::new(index_store); + + let shard_id = Uuid::new_v4(); + let collector = LsmDataSourceCollector::without_base_table(base_uri, vec![]) + .with_in_memory_memtables( + shard_id, + InMemoryMemTables { + active: InMemoryMemTableRef { + batch_store, + index_store, + schema: schema.clone(), + generation: 1, + }, + frozen: vec![], + }, + ); + let planner = LsmPointLookupPlanner::new(collector, vec!["id".to_string()], schema); + + // `lookup` takes the fast probe path (single-column PK, no system cols). 
+ let hit = planner + .lookup(&[ScalarValue::Int32(Some(1))], None) + .await + .unwrap() + .expect("pk=1 must be found via the PK-position index probe"); + assert_eq!(hit.num_rows(), 1); + let name = hit + .column_by_name("name") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!( + name.value(0), + "new_1", + "probe must return the newest version" + ); + + // An absent key resolves to None (no on-disk sources to consult). + assert!( + planner + .lookup(&[ScalarValue::Int32(Some(999))], None) + .await + .unwrap() + .is_none(), + "absent key must miss" + ); + } + #[tokio::test] async fn test_point_lookup_flushed_memtable_returns_newest_duplicate() { // Regression / invariant pin: when a flushed memtable contains two diff --git a/rust/lance/src/dataset/mem_wal/scanner/vector_search.rs b/rust/lance/src/dataset/mem_wal/scanner/vector_search.rs index 878063321aa..71e0674aa79 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/vector_search.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/vector_search.rs @@ -27,7 +27,6 @@ use crate::io::exec::TakeExec; use super::collector::LsmDataSourceCollector; use super::data_source::LsmDataSource; -use super::exec::{DedupDirection, WithinSourceDedupExec}; use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset}; use super::projection::{ DISTANCE_COLUMN, build_scanner_projection, canonical_output_schema, null_columns, @@ -38,10 +37,12 @@ use crate::session::Session; /// Plans vector search queries over LSM data. /// /// Each source is independently newest-per-PK before the union — the active -/// memtable via an over-fetched KNN + within-source dedup, flushed generations -/// via their within-generation deletion vector — and the cross-generation -/// block-list ([`super::exec::PkHashFilterExec`]) drops any PK superseded by a -/// newer generation. 
So each PK reaches the union from exactly one source and a +/// memtable via an over-fetched KNN + a newest-per-PK recency filter +/// ([`super::exec::NewestPkFilterExec`], which drops a hit that isn't the newest +/// visible version of its PK), flushed generations via their within-generation +/// deletion vector — and the cross-generation block-list +/// ([`super::exec::PkBlockFilterExec`]) drops any PK superseded by a newer +/// generation. So each PK reaches the union from exactly one source and a /// distance-ordered merge yields the global top-k; no cross-source dedup is /// needed. /// @@ -54,15 +55,15 @@ use crate::session::Session; /// UnionExec /// ProjectionExec (canonical output schema) /// SortExec(_distance, fetch=k) -/// WithinSourceDedupExec: KeepMaxRowAddr (active) +/// NewestPkFilterExec: newest-per-PK recency (active) /// KNNExec: active memtable, fetch=ceil(k*overfetch) /// ProjectionExec (canonical output schema) /// ProjectionExec (null_columns _rowid) -/// PkHashFilterExec: block-list (flushed) +/// PkBlockFilterExec: block-list (flushed) /// KNNExec: flushed gen N, fetch=ceil(k*overfetch) (fast_search) /// … one per flushed gen … /// ProjectionExec (canonical output schema) -/// PkHashFilterExec: block-list (base) +/// PkBlockFilterExec: block-list (base) /// KNNExec: base table, k (fast_search)[.refine()?] /// ``` /// @@ -168,7 +169,7 @@ impl LsmVectorSearchPlanner { /// the rows that filtering drops: /// /// - `factor < 1.0` (e.g. `0.0`): **stale filtering off.** The per-source - /// block-list / [`super::exec::PkHashFilterExec`] is not built or applied, + /// block-list / [`super::exec::PkBlockFilterExec`] is not built or applied, /// so rows superseded by a newer generation can surface. The global PK /// dedup still runs, so it still suppresses stale copies in the cases /// where both the stale and the fresh row reach it. @@ -210,11 +211,10 @@ impl LsmVectorSearchPlanner { // live candidates after the post-filter. 
let overfetch_factor = overfetch_factor.max(1.0); - // Per-source PK-hash block sets (`NEWER(G)`; base = union of all gens). + // Per-source PK block sets (`NEWER(G)`; base = union of all gens). // `Box::pin` keeps the future off `clippy::large_futures`. let block_lists = Box::pin(super::block_list::compute_source_block_lists( &sources, - &self.pk_columns, self.session.as_ref(), self.flushed_cache.as_ref(), )) @@ -270,29 +270,46 @@ impl LsmVectorSearchPlanner { .await?; let mut knn_plans = Vec::new(); - for ((_, is_base, is_active, blocked, _), knn) in arm_inputs.iter().zip(built) { + // `build_knn_plan` returns each active arm's max-visible snapshot + // alongside its plan; the active arm's NewestPkFilterExec needs both it + // and `source` (for the batch/index stores), so neither is discarded. + for ((source, is_base, is_active, blocked, _), (knn, active_max_visible)) in + arm_inputs.iter().zip(built) + { let is_base = *is_base; let is_active = *is_active; let blocked = *blocked; // Make each source independently newest-per-PK before the union: // * active: the append-only HNSW returns one node per inserted - // version, so collapse duplicate PKs to the newest insert - // (KeepMaxRowAddr on `_rowid`) and re-sort by distance. This - // stays probabilistic — a fresh version evicted from the - // over-fetched top-k still leaks. + // version *and* leaves stale versions of updated PKs live. The + // recency filter keeps only the hit that is the newest visible + // version of its PK (per the maintained MVCC PK-position index), + // closing the predicate-crossing stale read, then re-sort by + // distance. // * flushed/base: drop cross-gen superseded rows via the // block-list (within-gen is handled by the flushed DV). let knn = if is_active { - let deduped: Arc = Arc::new(WithinSourceDedupExec::new( - knn, - self.pk_columns.clone(), - lance_core::ROW_ID, - DedupDirection::KeepMaxRowAddr, - )); - sort_by_distance(deduped, k)? 
+ let (batch_store, index_store) = match source { + LsmDataSource::ActiveMemTable { + batch_store, + index_store, + .. + } => (batch_store.clone(), index_store.clone()), + _ => unreachable!("is_active implies ActiveMemTable"), + }; + let filtered: Arc = + Arc::new(super::exec::NewestPkFilterExec::new( + knn, + self.pk_columns.clone(), + lance_core::ROW_ID, + index_store, + batch_store, + active_max_visible.expect("active arm returns its max_visible snapshot"), + )); + sort_by_distance(filtered, k)? } else { match blocked { - Some(set) => Arc::new(super::exec::PkHashFilterExec::new( + Some(set) => Arc::new(super::exec::PkBlockFilterExec::new( knn, self.pk_columns.clone(), set.clone(), @@ -385,11 +402,15 @@ impl LsmVectorSearchPlanner { merged_sorted }; - // Under-fetch is warned per-source inside `PkHashFilterExec`. + // Under-fetch is warned per-source inside `PkBlockFilterExec`. Ok(result) } /// Build KNN plan for a single data source. + /// + /// Returns the plan and, for the active memtable, the `max_visible_batch_position` + /// snapshot its scanner latched — threaded into the recency filter so it keys + /// on the same snapshot the search saw (`None` for base / flushed sources). async fn build_knn_plan( &self, source: &LsmDataSource, @@ -398,7 +419,7 @@ impl LsmVectorSearchPlanner { nprobes: usize, projection: Option<&[String]>, refine: bool, - ) -> Result> { + ) -> Result<(Arc, Option)> { match source { LsmDataSource::BaseTable { dataset } => { let mut scanner = dataset.scan(); @@ -423,7 +444,7 @@ impl LsmVectorSearchPlanner { if refine { scanner.refine(1); } - scanner.create_plan().await + Ok((scanner.create_plan().await?, None)) } LsmDataSource::FlushedMemTable { path, .. 
} => { let dataset = @@ -439,7 +460,7 @@ impl LsmVectorSearchPlanner { scanner.nprobes(nprobes); scanner.distance_metric(self.distance_type); scanner.fast_search(); - scanner.create_plan().await + Ok((scanner.create_plan().await?, None)) } LsmDataSource::ActiveMemTable { batch_store, @@ -457,8 +478,8 @@ impl LsmVectorSearchPlanner { build_scanner_projection(projection, &self.base_schema, &self.pk_columns); scanner.project(&cols.iter().map(|s| s.as_str()).collect::>()); // Expose `_rowid` (BatchStore row offset, monotonic with - // insert order) so [`WithinSourceDedupExec`] can collapse - // duplicate-PK rows to the newest insert. The value is + // insert order) so `NewestPkFilterExec` can compare each hit's + // position against the PK-position index. The value is // per-source and NULL'd before reaching the canonical merge. // (VectorIndexExec only plumbs `with_row_id`, not // `with_row_address`, but the two yield identical values @@ -468,7 +489,9 @@ impl LsmVectorSearchPlanner { scanner.nearest(&self.vector_column, query_arr, k); scanner.nprobes(nprobes); scanner.distance_metric(self.distance_type); - scanner.create_plan().await + let plan = scanner.create_plan().await?; + // Capture the scanner's own latched snapshot for the recency filter. + Ok((plan, Some(scanner.max_visible_batch_position()))) } } } @@ -588,10 +611,19 @@ mod tests { async fn create_dataset(uri: &str, batches: Vec) -> Dataset { let schema = batches[0].schema(); - let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema); - Dataset::write(reader, uri, Some(WriteParams::default())) + let has_id = schema.column_with_name("id").is_some(); + let reader = RecordBatchIterator::new(batches.clone().into_iter().map(Ok), schema); + let dataset = Dataset::write(reader, uri, Some(WriteParams::default())) .await - .unwrap() + .unwrap(); + // Also write the standalone PK sidecar (on `id`) so a flushed-generation + // source can be probed by the block-list (harmless for a base table). 
+ if has_id { + crate::dataset::mem_wal::scanner::block_list::write_pk_sidecar(uri, &batches, &["id"]) + .await + .unwrap(); + } + dataset } #[tokio::test] @@ -662,6 +694,7 @@ mod tests { // Active memtable with HNSW index over the "vector" column. let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -780,6 +813,7 @@ mod tests { let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -859,6 +893,7 @@ mod tests { let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -972,6 +1007,7 @@ mod tests { let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -1028,8 +1064,7 @@ mod tests { plan_str ); assert!( - plan_str.contains("WithinSourceDedupExec") - && plan_str.contains("SortPreservingMergeExec"), + plan_str.contains("NewestPkFilterExec") && plan_str.contains("SortPreservingMergeExec"), "expected per-arm dedup + distance merge, got:\n{}", plan_str ); @@ -1112,6 +1147,7 @@ mod tests { // "right" vector close to the query, plus an unrelated pk=2. let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -1231,6 +1267,7 @@ mod tests { // Active memtable: id=3 with HNSW index. 
let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -1457,9 +1494,9 @@ mod tests { #[tokio::test] async fn test_vector_search_dedup_within_active_memtable() { // Regression: same PK inserted twice into one active memtable with - // *different* vectors. HNSW indexes each as a distinct node, so - // without WithinSourceDedupExec a KNN can return both candidates - // for the same PK and pollute top-k. The newer insert must win. + // *different* vectors. HNSW indexes each as a distinct node, so without + // the recency filter a KNN can return both candidates for the same PK + // and pollute top-k. The newer insert must win. use crate::dataset::mem_wal::scanner::collector::{InMemoryMemTableRef, InMemoryMemTables}; use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; use datafusion::prelude::SessionContext; @@ -1471,6 +1508,7 @@ mod tests { let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -1534,14 +1572,14 @@ mod tests { .await .unwrap(); - // The active arm collapses duplicate-PK HNSW nodes itself via - // WithinSourceDedupExec — there is no cross-source dedup fallback. + // The active arm collapses duplicate-PK HNSW nodes itself via the + // recency filter — there is no cross-source dedup fallback. 
let plan_str = format!( "{}", datafusion::physical_plan::displayable(plan.as_ref()).indent(true) ); assert!( - plan_str.contains("WithinSourceDedupExec"), + plan_str.contains("NewestPkFilterExec"), "active vector arm must self-dedup, got:\n{}", plan_str ); @@ -1570,10 +1608,120 @@ mod tests { ); } + #[tokio::test] + async fn test_vector_search_active_stale_update_out_of_neighborhood() { + // BUG REPRODUCTION (vector case: a PK update that moves out of the neighborhood). + // + // Within a *single* active memtable, pk=1 is first inserted ON the query + // (distance ~0), then updated to a FAR vector. The append-only HNSW keeps + // both nodes live. A result-set dedup only collapses duplicate PKs that + // are BOTH present in the over-fetched candidate set. + // + // Here the fresh (far) pk=1 is evicted from the candidate set — there are + // enough nearer filler rows that it ranks below the fetch cutoff — so the + // dedup never sees it and the STALE near pk=1 leaks as the nearest hit. + // This is the predicate-crossing hole: the row that *would* suppress the + // stale version isn't in the result set, so result-set dedup can't help. + // + // Desired (NewestPkFilterExec) behaviour: pk=1's newest row-position is + // the far one, computed predicate-independently over the whole memtable, + // so the stale near node is dropped and pk=1 must NOT surface at ~0. 
+ use crate::dataset::mem_wal::scanner::collector::{InMemoryMemTableRef, InMemoryMemTables}; + use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; + use datafusion::prelude::SessionContext; + use futures::TryStreamExt; + + let schema = create_vector_schema(); + let temp_dir = tempfile::tempdir().unwrap(); + let base_uri = format!("{}/base", temp_dir.path().to_str().unwrap()); + + let batch_store = Arc::new(BatchStore::with_capacity(16)); + let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); + index_store.add_hnsw( + "vector_hnsw".to_string(), + 1, + "vector".to_string(), + lance_linalg::distance::DistanceType::L2, + 64, + 8, + ); + + // First append: stale pk=1 ON the query, plus five filler rows strictly + // farther than pk=1 but far nearer than the eventual fresh pk=1. + let q = [0.1, 0.2, 0.3, 0.4]; + let stale_then_fillers = batch_rows( + &schema, + &[ + (1, q), + (10, [0.11, 0.21, 0.31, 0.41]), + (11, [0.13, 0.23, 0.33, 0.43]), + (12, [0.15, 0.25, 0.35, 0.45]), + (13, [0.17, 0.27, 0.37, 0.47]), + (14, [0.19, 0.29, 0.39, 0.49]), + ], + ); + let (bp0, off0, _) = batch_store.append(stale_then_fillers.clone()).unwrap(); + index_store + .insert_with_batch_position(&stale_then_fillers, off0, Some(bp0)) + .unwrap(); + + // Second append: the UPDATE — pk=1 moved far from the query. This is the + // newest version (largest row position) but it sits well outside top-k. 
+ let fresh_pk1 = batch_rows(&schema, &[(1, [9.0, 9.0, 9.0, 9.0])]); + let (bp1, off1, _) = batch_store.append(fresh_pk1.clone()).unwrap(); + index_store + .insert_with_batch_position(&fresh_pk1, off1, Some(bp1)) + .unwrap(); + let index_store = Arc::new(index_store); + + let shard_id = uuid::Uuid::new_v4(); + let collector = LsmDataSourceCollector::without_base_table(base_uri, vec![]) + .with_in_memory_memtables( + shard_id, + InMemoryMemTables { + active: InMemoryMemTableRef { + batch_store, + index_store, + schema: schema.clone(), + generation: 1, + }, + frozen: vec![], + }, + ); + + let planner = LsmVectorSearchPlanner::new( + collector, + vec!["id".to_string()], + schema, + "vector".to_string(), + lance_linalg::distance::DistanceType::L2, + ); + + // k=3, no over-fetch: the candidate set is {pk1@near, two nearest + // fillers}; fresh pk1@far ranks 7th and never enters the candidates. + let query = create_query_vector(); + let plan = planner + .plan_search(&query, 3, 1, None, false, 1.0) + .await + .unwrap(); + let ctx = SessionContext::new(); + let stream = plan.execute(0, ctx.task_ctx()).unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + let rows = collect_id_dist(&batches); + + assert!( + !rows.iter().any(|&(id, d)| id == 1 && d.abs() < 1e-3), + "stale near pk=1 leaked: its live vector is far from the query, so it \ + must not appear at distance ~0. results={:?}", + rows + ); + } + #[tokio::test] async fn test_vector_search_stale_read_when_fresh_falls_out_of_top_k() { // Regression for the cross-generation stale-read gap that the - // PkHashFilterExec block-list closes. + // PkBlockFilterExec block-list closes. // // Scenario: // * Base (gen 0): stale pk=1 sitting on the query (distance ~0). @@ -1608,6 +1756,7 @@ mod tests { // active arm surfaces pk=2 and drops fresh pk=1. 
let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -1804,6 +1953,7 @@ mod tests { // Active (gen 1): pk 1,2,3 re-inserted with a far vector (the fresh value). let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -2008,6 +2158,7 @@ mod tests { // Active: (1,1) re-inserted far (fresh) + an unrelated nearby (2,2). let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id1".to_string(), 0), ("id2".to_string(), 1)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -2112,6 +2263,7 @@ mod tests { let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, diff --git a/rust/lance/src/dataset/mem_wal/util.rs b/rust/lance/src/dataset/mem_wal/util.rs index d1413b84b2a..3f5090f6b40 100644 --- a/rust/lance/src/dataset/mem_wal/util.rs +++ b/rust/lance/src/dataset/mem_wal/util.rs @@ -169,6 +169,16 @@ pub fn flushed_memtable_path( shard_base_path(base_path, shard_id).join(format!("{}_gen_{}", random_hash, generation)) } +/// Subdirectory of a flushed generation holding its standalone primary-key +/// dedup index (a sidecar BTree, not registered in the manifest). Both the +/// flush writer and the block-list probe join this onto the generation path. +pub const PK_INDEX_DIR: &str = "_pk_index"; + +/// Path to a flushed generation's standalone primary-key dedup index. +pub fn pk_index_path(gen_path: &Path) -> Path { + gen_path.clone().join(PK_INDEX_DIR) +} + /// Generate an 8-character random hex string for flushed MemTable directories. 
pub fn generate_random_hash() -> String { let bytes: [u8; 4] = rand::random(); diff --git a/rust/lance/src/dataset/mem_wal/write.rs b/rust/lance/src/dataset/mem_wal/write.rs index 441da920b57..57acaf42ccd 100644 --- a/rust/lance/src/dataset/mem_wal/write.rs +++ b/rust/lance/src/dataset/mem_wal/write.rs @@ -846,6 +846,16 @@ async fn replay_memtable_from_wal( Ok(position) } +/// Pair each primary-key column name with its field id (both derived from the +/// schema's primary key, in the same order) for [`IndexStore::enable_pk_index`]. +fn pk_index_columns(pk_columns: &[String], pk_field_ids: &[i32]) -> Vec<(String, i32)> { + pk_columns + .iter() + .cloned() + .zip(pk_field_ids.iter().copied()) + .collect() +} + /// Shared state for writer operations. struct SharedWriterState { state: Arc>, @@ -855,6 +865,9 @@ struct SharedWriterState { config: ShardWriterConfig, schema: Arc, pk_field_ids: Vec, + /// Primary-key column names, used to (re)enable the PK-position index on + /// each fresh active memtable created at freeze. + pk_columns: Vec, max_memtable_batches: usize, max_memtable_rows: usize, index_configs: Vec, @@ -870,6 +883,7 @@ impl SharedWriterState { config: ShardWriterConfig, schema: Arc, pk_field_ids: Vec, + pk_columns: Vec, max_memtable_batches: usize, max_memtable_rows: usize, index_configs: Vec, @@ -882,6 +896,7 @@ impl SharedWriterState { config, schema, pk_field_ids, + pk_columns, max_memtable_batches, max_memtable_rows, index_configs, @@ -907,13 +922,17 @@ impl SharedWriterState { self.max_memtable_batches, )?; - if !self.index_configs.is_empty() { - let indexes = Arc::new(IndexStore::from_configs( + // Build an IndexStore when there are user indexes *or* a primary key: + // the PK dedup index (and its flushed on-disk sidecar) is required for + // cross-generation dedup even when no secondary index is configured. 
+ if !self.index_configs.is_empty() || !self.pk_columns.is_empty() { + let mut indexes = IndexStore::from_configs( &self.index_configs, self.max_memtable_rows, self.max_memtable_batches, - )?); - new_memtable.set_indexes_arc(indexes); + )?; + indexes.enable_pk_index(&pk_index_columns(&self.pk_columns, &self.pk_field_ids)); + new_memtable.set_indexes_arc(Arc::new(indexes)); } let mut old_memtable = std::mem::replace(&mut state.memtable, new_memtable); @@ -1287,11 +1306,9 @@ impl ShardWriter { ) -> Result { // Create MemTable with primary key field IDs from schema let lance_schema = Schema::try_from(schema.as_ref())?; - let pk_field_ids: Vec = lance_schema - .unenforced_primary_key() - .iter() - .map(|f| f.id) - .collect(); + let pk_fields = lance_schema.unenforced_primary_key(); + let pk_field_ids: Vec = pk_fields.iter().map(|f| f.id).collect(); + let pk_columns: Vec = pk_fields.iter().map(|f| f.name.clone()).collect(); let mut memtable = MemTable::with_capacity( schema.clone(), manifest.current_generation, @@ -1300,14 +1317,18 @@ impl ShardWriter { config.max_memtable_batches, )?; - // Create indexes if configured and set them on the MemTable. - if !index_configs.is_empty() { - let indexes = Arc::new(IndexStore::from_configs( + // Create indexes if configured and set them on the MemTable. The + // PK-position index is enabled before any WAL replay below so replayed + // rows are recorded in it. A primary key alone (no secondary index) + // still needs the PK index so flush writes its on-disk dedup sidecar. 
+ if !index_configs.is_empty() || !pk_columns.is_empty() { + let mut indexes = IndexStore::from_configs( index_configs, config.max_memtable_rows, config.max_memtable_batches, - )?); - memtable.set_indexes_arc(indexes); + )?; + indexes.enable_pk_index(&pk_index_columns(&pk_columns, &pk_field_ids)); + memtable.set_indexes_arc(Arc::new(indexes)); } // Replay any WAL entries written after the last successfully-flushed @@ -1395,6 +1416,7 @@ impl ShardWriter { config.clone(), schema.clone(), pk_field_ids, + pk_columns, config.max_memtable_batches, config.max_memtable_rows, index_configs.to_vec(), From bb72a16194f27aa08bf86cb58657d3495e2e37f2 Mon Sep 17 00:00:00 2001 From: DanielMao <52651331+DanielMao1@users.noreply.github.com> Date: Thu, 18 Jun 2026 03:12:57 +0800 Subject: [PATCH 131/177] fix: return error instead of panicking on zero-dimension fixed-size-list columns (#7247) Closes #5102 ## Problem A fixed-size-list column with dimension 0 panics with `attempt to divide by zero` (`rust/lance-encoding/src/data.rs`, `FixedSizeListBlock::num_values`). As of pylance 7.0.0 the panic fires on **write** for every storage version (`stable`/`2.1`/`2.2`), and reading datasets persisted by older writers (which accepted such columns) panics as well. Reproduction details are in the issue comment: https://github.com/lance-format/lance/issues/5102#issuecomment-4689259100 ## Approach Following the maintainer guidance in #5102 (error, not panic), this adds two small guards at boundaries that already return `Result`, instead of changing `DataBlock::num_values()` to return `Result` (the approach that made #5159 balloon across the whole encoding crate): 1. **Write side**: `Schema::validate()` rejects zero-dimension fixed-size-list fields (including nested ones). `validate()` runs inside `Schema::try_from(&ArrowSchema)`, so every write entry point surfaces a clean schema error instead of a panic. Writes currently panic on every storage version, so no working flow changes behavior. 2. 
**Read side (defensive)**: the structural and legacy field-scheduler builders reject zero-dimension fixed-size lists with an invalid-input error, so datasets persisted by old writers fail cleanly at scheduling time instead of crashing the process. ## How the guards sit in the data flow ![guards](https://raw.githubusercontent.com/DanielMao1/lance/pr-assets/zero-dim-fsl-guards.png) Two facts that shape the design: - `Schema::try_from(&ArrowSchema)` calls `validate()` internally and every write path performs this conversion, so guard 1 in one place covers all write entry points. - Guard 2 exists because writers up to ~2026-04 could still persist zero-dimension columns under the `stable` (2.0) storage version; reading those files must not crash the process. ## Tests - `lance-core`: `Schema::try_from` rejects zero-dim FSL at top level and nested in a struct; positive dimensions still validate. - `lance-encoding`: the scheduler guard rejects zero-dim FSL, including FSL-nested-in-FSL, and accepts positive dimensions. - Python: parametrized over `legacy`/`stable`/`2.1`, `write_dataset` now raises a clean `OSError` (same mapping as other schema validation errors) instead of `PanicException`. 
Co-authored-by: Daniel Mao --- python/python/tests/test_dataset.py | 19 ++++++ rust/lance-core/src/datatypes.rs | 1 + rust/lance-core/src/datatypes/schema.rs | 90 ++++++++++++++++++++++++- rust/lance-encoding/src/decoder.rs | 52 +++++++++++++- 4 files changed, 160 insertions(+), 2 deletions(-) diff --git a/python/python/tests/test_dataset.py b/python/python/tests/test_dataset.py index 39dac98aec6..45866f3c4da 100644 --- a/python/python/tests/test_dataset.py +++ b/python/python/tests/test_dataset.py @@ -93,6 +93,25 @@ def test_roundtrip_types(tmp_path: Path): assert dataset.to_table() == table +@pytest.mark.parametrize("data_storage_version", ["legacy", "stable", "2.1"]) +def test_write_zero_dimension_fixed_size_list( + tmp_path: Path, data_storage_version: str +): + # Zero-dimension fixed-size lists must be rejected with a clean error + # instead of a divide-by-zero panic (#5102) + schema = pa.schema( + [ + pa.field("id", pa.int64()), + pa.field("vec", pa.list_(pa.float32(), 0)), + ] + ) + table = pa.table({"id": [1], "vec": [[]]}, schema=schema) + with pytest.raises(OSError, match="dimension must be a positive integer"): + lance.write_dataset( + table, tmp_path / "ds.lance", data_storage_version=data_storage_version + ) + + def test_dataset_overwrite(tmp_path: Path): table1 = pa.Table.from_pylist([{"a": 1, "b": 2}, {"a": 10, "b": 20}]) base_dir = tmp_path / "test" diff --git a/rust/lance-core/src/datatypes.rs b/rust/lance-core/src/datatypes.rs index 628f9cf9a90..8837037c308 100644 --- a/rust/lance-core/src/datatypes.rs +++ b/rust/lance-core/src/datatypes.rs @@ -25,6 +25,7 @@ pub use field::{ pub use schema::{ BlobHandling, FieldRef, OnMissing, Projectable, Projection, Schema, escape_field_path_for_project, format_field_path, parse_field_path, + validate_fixed_size_list_dimensions, }; pub static BLOB_DESC_FIELDS: LazyLock = LazyLock::new(|| { diff --git a/rust/lance-core/src/datatypes/schema.rs b/rust/lance-core/src/datatypes/schema.rs index 
f959c37672f..d13eb476359 100644 --- a/rust/lance-core/src/datatypes/schema.rs +++ b/rust/lance-core/src/datatypes/schema.rs @@ -11,7 +11,7 @@ use std::{ use crate::deepsize::DeepSizeOf; use arrow_array::RecordBatch; -use arrow_schema::{Field as ArrowField, Schema as ArrowSchema}; +use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema}; use lance_arrow::*; use super::field::{Field, OnTypeMismatch, SchemaCompareOptions}; @@ -110,6 +110,29 @@ impl<'a> Iterator for SchemaFieldIterPreOrder<'a> { } } +/// Reject `FixedSizeList` types whose dimension is not a positive integer. +/// +/// The row count of a fixed-size list is derived by dividing the number of +/// child items by the dimension, so a zero dimension panics with a +/// divide-by-zero further down the write path (see issue #5102). A +/// `FixedSizeList` of a `FixedSizeList` over a primitive collapses into a +/// single leaf field, so the pre-order field walk never visits the inner list; +/// recurse through the nested list types here to catch an inner zero dimension. +/// +/// Shared by [`Schema::validate`] on the write path and the decoder's +/// field-scheduler builders on the read path. +pub fn validate_fixed_size_list_dimensions(field_name: &str, data_type: &DataType) -> Result<()> { + if let DataType::FixedSizeList(inner, dimension) = data_type { + if *dimension <= 0 { + return Err(Error::schema(format!( + "Field \"{field_name}\" contains a FixedSizeList with dimension {dimension}; dimension must be a positive integer" + ))); + } + validate_fixed_size_list_dimensions(field_name, inner.data_type())?; + } + Ok(()) +} + impl Schema { /// The unenforced primary key fields in the schema, ordered by position. /// @@ -346,6 +369,10 @@ impl Schema { field.id, self ))); } + // The row count of a fixed-size list is derived by dividing the + // number of items by the dimension, so a zero dimension would + // panic with a divide-by-zero further down the write path. 
+ validate_fixed_size_list_dimensions(&field.name, &field.data_type())?; } Ok(()) @@ -2825,6 +2852,67 @@ mod tests { assert!(paths.contains(&"name".to_string())); } + #[test] + fn test_validate_rejects_zero_dimension_fixed_size_list() { + // A zero dimension divides-by-zero further down the write path (#5102) + let fsl = |dimension: i32| { + ArrowDataType::FixedSizeList( + Arc::new(ArrowField::new("item", ArrowDataType::Float32, true)), + dimension, + ) + }; + + let arrow_schema = ArrowSchema::new(vec![ArrowField::new("vec", fsl(0), true)]); + let err = Schema::try_from(&arrow_schema).unwrap_err(); + assert!( + err.to_string() + .contains("dimension must be a positive integer"), + "unexpected error: {}", + err + ); + + // Nested inside a struct is rejected too + let arrow_schema = ArrowSchema::new(vec![ArrowField::new( + "outer", + ArrowDataType::Struct(ArrowFields::from(vec![ArrowField::new( + "vec", + fsl(0), + true, + )])), + true, + )]); + let err = Schema::try_from(&arrow_schema).unwrap_err(); + assert!( + err.to_string() + .contains("dimension must be a positive integer"), + "unexpected error: {}", + err + ); + + // A zero-dimension FixedSizeList nested inside a positive-dimension + // FixedSizeList collapses into a single leaf field, so the inner + // dimension is not visited by the pre-order field walk and must still + // be rejected: FixedSizeList(FixedSizeList(Float32, 0), 4). 
+ let nested = + ArrowDataType::FixedSizeList(Arc::new(ArrowField::new("inner", fsl(0), true)), 4); + let arrow_schema = ArrowSchema::new(vec![ArrowField::new("vec", nested, true)]); + let err = Schema::try_from(&arrow_schema).unwrap_err(); + assert!( + err.to_string() + .contains("dimension must be a positive integer"), + "unexpected error: {}", + err + ); + + // A positive dimension still validates, including nested lists + let arrow_schema = ArrowSchema::new(vec![ArrowField::new("vec", fsl(2), true)]); + assert!(Schema::try_from(&arrow_schema).is_ok()); + let nested_ok = + ArrowDataType::FixedSizeList(Arc::new(ArrowField::new("inner", fsl(2), true)), 4); + let arrow_schema = ArrowSchema::new(vec![ArrowField::new("vec", nested_ok, true)]); + assert!(Schema::try_from(&arrow_schema).is_ok()); + } + #[test] fn test_schema_unenforced_clustering_key() { use crate::datatypes::field::LANCE_UNENFORCED_CLUSTERING_KEY_POSITION; diff --git a/rust/lance-encoding/src/decoder.rs b/rust/lance-encoding/src/decoder.rs index 59886d337d1..a30d5ed93a9 100644 --- a/rust/lance-encoding/src/decoder.rs +++ b/rust/lance-encoding/src/decoder.rs @@ -226,7 +226,9 @@ use futures::stream::{self, BoxStream}; use futures::{FutureExt, StreamExt}; use lance_arrow::DataTypeExt; use lance_core::cache::LanceCache; -use lance_core::datatypes::{BLOB_DESC_LANCE_FIELD, Field, Schema}; +use lance_core::datatypes::{ + BLOB_DESC_LANCE_FIELD, Field, Schema, validate_fixed_size_list_dimensions, +}; use lance_core::utils::futures::{FinallyStreamExt, StreamOnDropExt}; use lance_core::utils::parse::parse_env_as_bool; use log::{debug, trace, warn}; @@ -723,6 +725,7 @@ impl CoreFieldDecoderStrategy { column_infos: &mut ColumnInfoIter, ) -> Result> { let data_type = field.data_type(); + validate_fixed_size_list_dimensions(&field.name, &data_type)?; if Self::is_structural_primitive(&data_type) { let column_info = column_infos.expect_next()?; let scheduler = Box::new(StructuralPrimitiveFieldScheduler::try_new( @@ 
-832,6 +835,7 @@ impl CoreFieldDecoderStrategy { buffers: FileBuffers, ) -> Result> { let data_type = field.data_type(); + validate_fixed_size_list_dimensions(&field.name, &data_type)?; if Self::is_primitive_legacy(&data_type) { let column_info = column_infos.expect_next()?; let scheduler = self.create_primitive_scheduler(field, column_info, buffers)?; @@ -2887,6 +2891,52 @@ pub async fn decode_batch( mod tests { use super::*; + #[test] + fn test_read_zero_dimension_fsl_errors_instead_of_panicking() { + // Simulates reading a column whose stored schema declares a + // zero-dimension FixedSizeList, as old writers (before #5102) could + // persist. The read plan is built by the field-scheduler factories, + // which run the dimension guard before touching any column data, so + // an empty column iterator is sufficient to reach the guard. The read + // must surface a clean error rather than a divide-by-zero panic. + use arrow_schema::Field as ArrowField; + + let zero_dim = DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float32, true)), + 0, + ); + let field = Field::try_from(&ArrowField::new("vec", zero_dim, true)).unwrap(); + let strategy = CoreFieldDecoderStrategy::default(); + + let mut structural_columns = ColumnInfoIter::new(vec![], &[]); + let err = strategy + .create_structural_field_scheduler(&field, &mut structural_columns) + .unwrap_err(); + assert!( + err.to_string() + .contains("dimension must be a positive integer"), + "unexpected error: {}", + err + ); + + let mut legacy_columns = ColumnInfoIter::new(vec![], &[]); + let err = strategy + .create_legacy_field_scheduler( + &field, + &mut legacy_columns, + FileBuffers { + positions_and_sizes: &[], + }, + ) + .unwrap_err(); + assert!( + err.to_string() + .contains("dimension must be a positive integer"), + "unexpected error: {}", + err + ); + } + #[test] fn test_coalesce_indices_to_ranges_with_single_index() { let indices = vec![1]; From 996e0632c33f5d25fa4c9761aa0a66fb6f38caa9 Mon Sep 
17 00:00:00 2001 From: Julian Date: Wed, 17 Jun 2026 12:38:28 -0700 Subject: [PATCH 132/177] perf(scalar): compile IsIn predicate once across BTree pages (#7287) ## What `BTreeIndex::search` rebuilt the full `col IN (...)` physical expression on every page it touched. For a large IN-list spanning many pages this is O(pages x values) -- the expression (and its hash set) is reconstructed per page even though it is identical across pages. This compiles the predicate once in `BTreeIndex::search` and reuses it across pages via a new `FlatIndex::search_prebuilt`. Membership is O(1) per row regardless of set size, so only the repeated build was wasted work. Resolves the existing `// TODO` in `search_page`. ## Why `col IN (...)` is used to resolve big key sets to row ids. On a real 83M-row table, an `IsIn` of ~46K values took 13.4s, almost entirely per-page expression construction. ## Result Same table/query, index lookup **13.4s -> 3.2s**. Cost is now bounded by pages touched + rows scanned, independent of IN-list size (a local 80K-value lookup over a multi-page index runs ~130ms and is flat in the value count). ## Notes - No public API or behavior change -- the predicate and its evaluation are identical, just built once instead of per page. - `cargo test -p lance-index` passes (279 scalar tests); `cargo fmt` clean. 
Co-authored-by: Yuan Gao Co-authored-by: Claude Opus 4.8 --- rust/lance-index/src/scalar/btree.rs | 55 ++++++++++++++++++----- rust/lance-index/src/scalar/btree/flat.rs | 17 ++++++- 2 files changed, 60 insertions(+), 12 deletions(-) diff --git a/rust/lance-index/src/scalar/btree.rs b/rust/lance-index/src/scalar/btree.rs index c46702760cc..cd24f251718 100644 --- a/rust/lance-index/src/scalar/btree.rs +++ b/rust/lance-index/src/scalar/btree.rs @@ -46,8 +46,11 @@ use datafusion::physical_plan::{ sorts::sort_preserving_merge::SortPreservingMergeExec, stream::RecordBatchStreamAdapter, union::UnionExec, }; -use datafusion_common::{DataFusionError, ScalarValue}; -use datafusion_physical_expr::{PhysicalSortExpr, expressions::Column}; +use datafusion_common::{DFSchema, DataFusionError, ScalarValue}; +use datafusion_expr::execution_props::ExecutionProps; +use datafusion_physical_expr::{ + PhysicalExpr, PhysicalSortExpr, create_physical_expr, expressions::Column, +}; use futures::{ FutureExt, Stream, StreamExt, TryFutureExt, TryStreamExt, future::BoxFuture, @@ -1643,11 +1646,28 @@ impl BTreeIndex { FlatIndex::try_new(serialized_page) } + /// Compile a sargable predicate into a physical expr against the per-page + /// schema ([values, ids]). Built once in `search` and shared across pages so + /// a large IN-list is not re-materialized for every page. + fn compile_predicate(&self, query: &SargableQuery) -> Result> { + let schema = Arc::new(Schema::new(vec![ + Field::new(BTREE_VALUES_COLUMN, self.data_type.clone(), true), + Field::new(BTREE_IDS_COLUMN, DataType::UInt64, false), + ])); + let df_schema = DFSchema::try_from(schema)?; + Ok(create_physical_expr( + &query.to_expr(BTREE_VALUES_COLUMN.to_string()), + &df_schema, + &ExecutionProps::default(), + )?) 
+ } + async fn search_page( &self, query: &SargableQuery, matches: Matches, index_reader: LazyIndexReader, + prebuilt: Option<&Arc>, metrics: &dyn MetricsCollector, ) -> Result { let subindex = self @@ -1655,13 +1675,12 @@ impl BTreeIndex { .await?; match matches { - Matches::Some(_) => { - // TODO: If this is an IN query we can perhaps simplify the subindex query by restricting it to the - // values that might be in the page. E.g. if we are searching for X IN [5, 3, 7] and five is in pages - // 1 and 2 and three is in page 2 and seven is in pages 8 and 9, then when searching page 2 we only need - // to search for X IN [5, 3] - subindex.search(query, metrics) - } + // For a large IsIn the predicate is compiled once (see `search`) and + // reused here, instead of rebuilding the whole IN-list per page. + Matches::Some(_) => match prebuilt { + Some(expr) => subindex.search_prebuilt(expr, metrics), + None => subindex.search(query, metrics), + }, Matches::All(_) => Ok(match query { // This means we hit an all-null page so just grab all row ids as true SargableQuery::IsNull() => subindex.all_ignore_nulls(), @@ -2119,13 +2138,27 @@ impl ScalarIndex for BTreeIndex { } } + // Compile a large IsIn predicate once and reuse it across every page; + // rebuilding the full IN-list per page is O(pages * values) and dominates + // the lookup for sets with many values. 
+ let prebuilt = match query { + SargableQuery::IsIn(_) => Some(self.compile_predicate(query)?), + _ => None, + }; + let lazy_index_reader = LazyIndexReader::new(self.store.clone(), self.ranges_to_files.clone()); let page_tasks = pages .into_iter() .map(|page_index| { - self.search_page(query, page_index, lazy_index_reader.clone(), metrics) - .boxed() + self.search_page( + query, + page_index, + lazy_index_reader.clone(), + prebuilt.as_ref(), + metrics, + ) + .boxed() }) .collect::>(); debug!("Searching {} btree pages", page_tasks.len()); diff --git a/rust/lance-index/src/scalar/btree/flat.rs b/rust/lance-index/src/scalar/btree/flat.rs index 7663d8478c1..744f6a3cb3c 100644 --- a/rust/lance-index/src/scalar/btree/flat.rs +++ b/rust/lance-index/src/scalar/btree/flat.rs @@ -11,7 +11,7 @@ use arrow_array::{ use datafusion_common::DFSchema; use datafusion_expr::execution_props::ExecutionProps; -use datafusion_physical_expr::create_physical_expr; +use datafusion_physical_expr::{PhysicalExpr, create_physical_expr}; use lance_arrow::RecordBatchExt; use lance_core::Result; use lance_core::cache::{CacheCodecImpl, CacheEntryReader, CacheEntryWriter}; @@ -237,7 +237,22 @@ impl FlatIndex { // No shortcut possible, need to actually evaluate the query let expr = query.to_expr(BTREE_VALUES_COLUMN.to_string()); let expr = create_physical_expr(&expr, &self.df_schema, &ExecutionProps::default())?; + self.eval_expr(&expr) + } + + /// Evaluate a predicate compiled once by the caller. Lets a large IsIn that + /// spans many pages build the physical expr a single time instead of + /// rebuilding the whole IN-list per page (the dominant cost of a big lookup). 
+ pub fn search_prebuilt( + &self, + expr: &Arc, + metrics: &dyn MetricsCollector, + ) -> Result { + metrics.record_comparisons(self.data.num_rows()); + self.eval_expr(expr) + } + fn eval_expr(&self, expr: &Arc) -> Result { let predicate = expr.evaluate(&self.data)?; let predicate = predicate.into_array(self.data.num_rows())?; let predicate = predicate From 64c7fbd45886419e887523fc933617c2b33d1626 Mon Sep 17 00:00:00 2001 From: XY Zhan Date: Wed, 17 Jun 2026 15:53:36 -0400 Subject: [PATCH 133/177] perf: batch fragment-reuse index remap into a single rebuild + commit (#7317) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary `remap_index` — the catch-up that physically applies a fragment-reuse index after a deferred-remap compaction — applied the reuse index **one version at a time**, rebuilding the index file and committing **once per reuse version**. An index touched by K deferred compactions paid **K full index rebuilds + K commits** for a result identical to applying all K at once. This is worst exactly when the reuse index has accumulated many versions before a remap runs. ## Change Compose the whole chain and rebuild once: - **Row addresses:** `FragReuseIndex::remap_row_id` already chains every version (and passes through addresses a version does not touch), so mapping the union of all versions' keys yields a single **baseline → final** address map, applied in one rebuild. - **Coverage bitmap:** composed in one pass with the same all-or-nothing / straddle-error semantics (chaining is automatic — a version's new fragments are the next version's old fragments). `data_predates_version` is evaluated against the fixed baseline since there are no intermediate commits. - One `CreateIndex` commit instead of one per version. ## Why the composed map is not filtered by the fragment bitmap A tempting way to keep the composed map small is to drop keys whose fragment isn't in the index's current `fragment_bitmap`. 
That optimization is **not** safe — the per-version loop never did it, and this PR keeps it that way. In the sibling-coverage-remap case, remapping one index commits a manifest that coverage-remaps a *sibling* index's bitmap onto the new fragments and persists it *before* the sibling's own data is remapped. The sibling's on-disk bitmap then shows the new fragments while its data still holds old addresses. Filtering the map by that bitmap would drop exactly the keys the sibling needs, leaving an **empty** map — and `index::remap_index` treats an all-`None`/empty map as `RemapResult::Keep`, reusing the stale index files while the version is bumped and the reuse index trims, so the index would end up pointing at dead fragments. So the composed map maps every old address the reuse index touched; addresses an index doesn't store are simply never looked up (the map stays bounded by the rows the reuse index touched). ## Tests - `test_remap_index_batches_multiple_reuse_versions` — a multi-version reuse chain must rebuild + commit exactly once. - `test_cleanup_frag_reuse_index_multiple_indices` — extended with a post-remap data-correctness scan so it asserts each remapped index resolves to **live rows**, not just that versions advance and the reuse index trims. (A bitmap-filtered map would make the sibling index return 0 of 1000 rows here.) 
``` cargo test -p lance --lib frag_reuse::tests # 3 passed cargo test -p lance --lib remap # 21 passed ``` --- rust/lance/src/dataset/index/frag_reuse.rs | 107 +++++++++++ rust/lance/src/dataset/optimize/remapping.rs | 188 +++++++++++-------- 2 files changed, 212 insertions(+), 83 deletions(-) diff --git a/rust/lance/src/dataset/index/frag_reuse.rs b/rust/lance/src/dataset/index/frag_reuse.rs index ed6f027e159..ceebe456bbf 100644 --- a/rust/lance/src/dataset/index/frag_reuse.rs +++ b/rust/lance/src/dataset/index/frag_reuse.rs @@ -329,5 +329,112 @@ mod tests { .await .unwrap(); assert_eq!(frag_reuse_details.versions.len(), 0); + + // Data correctness, not just version bookkeeping: with the reuse index + // trimmed there is no auto-remap safety net, so each index must resolve + // to LIVE rows. An index whose data was not actually remapped (e.g. one + // whose bitmap was coverage-remapped by a sibling's commit before its + // own data remap) points at compacted-away fragments and errors on take. + use futures::TryStreamExt; + for col in ["i", "j"] { + let rows: usize = dataset + .scan() + .filter(&format!("{col} >= 2000 AND {col} < 3000")) + .unwrap() + .try_into_stream() + .await + .unwrap() + .try_collect::>() + .await + .unwrap() + .iter() + .map(|b| b.num_rows()) + .sum(); + assert_eq!( + rows, 1000, + "index {col}_idx must resolve to live rows after remap+trim" + ); + } + } + + /// When the reuse index has accumulated several versions, a single remap + /// must compose them and rebuild + commit the index exactly ONCE, not once + /// per version. 
+ #[tokio::test] + async fn test_remap_index_batches_multiple_reuse_versions() { + let mut dataset = lance_datagen::gen_batch() + .col("i", lance_datagen::array::step::()) + .into_ram_dataset(FragmentCount::from(8), FragmentRowCount::from(1000)) + .await + .unwrap(); + dataset + .create_index( + &["i"], + IndexType::Scalar, + Some("i_idx".into()), + &ScalarIndexParams::default(), + false, + ) + .await + .unwrap(); + + // Accumulate multiple reuse versions: each round deletes a prefix, which + // shrinks fragments below target and forces another deferred compaction. + let options = CompactionOptions { + target_rows_per_fragment: 4_000, + defer_index_remap: true, + ..Default::default() + }; + for round in 0..4 { + dataset + .delete(&format!("i < {}", 1_000 * (round + 1))) + .await + .unwrap(); + compact_files(&mut dataset, options.clone(), None) + .await + .unwrap(); + } + + let frag_reuse_index_meta = dataset + .load_index_by_name(FRAG_REUSE_INDEX_NAME) + .await + .unwrap() + .expect("Fragment reuse index must be available"); + let num_versions = load_frag_reuse_index_details(&dataset, &frag_reuse_index_meta) + .await + .unwrap() + .versions + .len(); + assert!( + num_versions >= 2, + "test needs multiple reuse versions to exercise batching, got {num_versions}" + ); + + // A single remap must commit exactly once, regardless of version count. + let version_before = dataset.manifest.version; + remapping::remap_column_index(&mut dataset, &["i"], Some("i_idx".into())) + .await + .unwrap(); + let commits = dataset.manifest.version - version_before; + assert_eq!( + commits, 1, + "batched remap must commit once, not once per reuse version ({num_versions})" + ); + + // ... and the reuse index then trims to zero. 
+ cleanup_frag_reuse_index(&mut dataset).await.unwrap(); + let frag_reuse_index_meta = dataset + .load_index_by_name(FRAG_REUSE_INDEX_NAME) + .await + .unwrap() + .expect("Fragment reuse index must be available"); + assert_eq!( + load_frag_reuse_index_details(&dataset, &frag_reuse_index_meta) + .await + .unwrap() + .versions + .len(), + 0 + ); } } diff --git a/rust/lance/src/dataset/optimize/remapping.rs b/rust/lance/src/dataset/optimize/remapping.rs index ca1ed54f30f..266ac977a69 100644 --- a/rust/lance/src/dataset/optimize/remapping.rs +++ b/rust/lance/src/dataset/optimize/remapping.rs @@ -220,32 +220,37 @@ async fn remap_index(dataset: &mut Dataset, index_id: &Uuid) -> Result<()> { return Ok(()); } - // Sequentially apply the row addr maps from oldest to latest - let mut curr_index_id = *index_id; - for (i, row_id_map) in frag_reuse_index.row_id_maps.iter().enumerate() { - let version = &frag_reuse_index.details.versions[i]; - // load on-disk index metadata before auto-remap - let curr_index_meta = read_manifest_indexes( - &dataset.object_store, - &dataset.manifest_location, - &dataset.manifest, - ) - .await? - .into_iter() - .find(|idx| idx.uuid == curr_index_id) - .unwrap(); - - // Whether the index data predates this reuse version, i.e. its stored - // row addresses still point at the compacted-away fragments. The - // fragment bitmap alone cannot tell us this: `load_indices` - // coverage-remaps the bitmap onto the new fragments in memory, and a - // later commit can persist that cleaned bitmap to disk without the index - // data ever being remapped (e.g. while remapping a *sibling* index). - let data_predates_version = curr_index_meta.dataset_version < version.dataset_version; - let maybe_index_bitmap = curr_index_meta.fragment_bitmap.clone(); - let (should_remap, bitmap_after_remap) = match maybe_index_bitmap { - Some(mut index_frag_bitmap) => { - let mut should_remap = false; + // Read the index's on-disk metadata once. 
Its stored row addresses are at + // this baseline; we compose all reuse versions into a single remap so the + // index file is rebuilt and committed exactly once, rather than once per + // version (the reuse index can accumulate many versions before remap runs). + let curr_index_meta = read_manifest_indexes( + &dataset.object_store, + &dataset.manifest_location, + &dataset.manifest, + ) + .await? + .into_iter() + .find(|idx| idx.uuid == *index_id) + .ok_or_else(|| { + Error::index(format!( + "index {index_id} not found in manifest; it may have been concurrently dropped" + )) + })?; + + // Compose the coverage (fragment bitmap) remap across every reuse version in + // one pass. Chaining is automatic: a version inserts its new fragments, + // which a later version then sees as its old fragments. `data_predates_version` + // is evaluated against the fixed baseline (there are no intermediate + // commits), and the new-fragment branch handles a bitmap that was already + // coverage-remapped + persisted before the data was remapped (e.g. while + // remapping a *sibling* index). 
+ let baseline_version = curr_index_meta.dataset_version; + let (should_remap, bitmap_after_remap) = match curr_index_meta.fragment_bitmap.clone() { + Some(mut index_frag_bitmap) => { + let mut should_remap = false; + for version in frag_reuse_index.details.versions.iter() { + let data_predates_version = baseline_version < version.dataset_version; for group in version.groups.iter() { let mut old_frag_in_index = 0; for old_frag in group.old_frags.iter() { @@ -265,8 +270,7 @@ async fn remap_index(dataset: &mut Dataset, index_id: &Uuid) -> Result<()> { group.old_frags ))); } - index_frag_bitmap - .extend(group.new_frags.clone().into_iter().map(|f| f.id as u32)); + index_frag_bitmap.extend(group.new_frags.iter().map(|f| f.id as u32)); should_remap = true; } else if data_predates_version && group @@ -277,68 +281,86 @@ async fn remap_index(dataset: &mut Dataset, index_id: &Uuid) -> Result<()> { // The bitmap was already coverage-remapped onto this // group's new fragments and persisted before the data was // remapped, so the old fragments are gone from the bitmap - // but the index data still needs remapping. Without this - // the data remap is silently skipped and the reuse index - // can never be trimmed. + // but the index data still needs remapping. should_remap = true; } } - (should_remap, Some(index_frag_bitmap)) } - // if there is no fragment bitmap for the index, - // we attempt remapping but will not update the fragment bitmap. 
- None => (true, None), - }; - - if should_remap { - let remap_result = index::remap_index(dataset, &curr_index_id, row_id_map).await?; - - let new_index_meta = match remap_result { - RemapResult::Drop => continue, - RemapResult::Keep(new_id) => IndexMetadata { - uuid: new_id, - name: curr_index_meta.name.clone(), - fields: curr_index_meta.fields.clone(), - dataset_version: dataset.manifest.version, - fragment_bitmap: bitmap_after_remap, - index_details: curr_index_meta.index_details.clone(), - index_version: curr_index_meta.index_version, - created_at: curr_index_meta.created_at, - base_id: None, - files: curr_index_meta.files.clone(), - }, - RemapResult::Remapped(remapped_index) => IndexMetadata { - uuid: remapped_index.new_id, - name: curr_index_meta.name.clone(), - fields: curr_index_meta.fields.clone(), - dataset_version: dataset.manifest.version, - fragment_bitmap: bitmap_after_remap, - index_details: Some(Arc::new(remapped_index.index_details)), - index_version: remapped_index.index_version as i32, - created_at: curr_index_meta.created_at, - base_id: None, - files: remapped_index.files, - }, - }; - - let new_id = new_index_meta.uuid; + (should_remap, Some(index_frag_bitmap)) + } + // if there is no fragment bitmap for the index, + // we attempt remapping but will not update the fragment bitmap. + None => (true, None), + }; - let transaction = Transaction::new( - dataset.manifest.version, - Operation::CreateIndex { - new_indices: vec![new_index_meta], - removed_indices: vec![curr_index_meta.clone()], - }, - None, - ); + if !should_remap { + return Ok(()); + } - dataset - .apply_commit(transaction, &Default::default(), &Default::default()) - .await?; + // Compose the row-address remap across all versions. `remap_row_id` already + // chains every version (and passes through addresses a version does not + // touch), so mapping the union of all versions' keys yields a single + // baseline -> final address map applied in one rebuild. 
+ // + // Map every old address; do NOT filter by the current `fragment_bitmap`. In + // the sibling-coverage-remap case the bitmap was already advanced onto the + // new fragments while the index data still holds old addresses, so filtering + // by it would drop exactly the keys this index needs and leave its data + // stale (an empty map makes `index::remap_index` return `Keep`). The map is + // bounded by the rows the reuse index touched; addresses this index does not + // store are simply never looked up. + let composed_row_id_map: HashMap> = frag_reuse_index + .row_id_maps + .iter() + .flat_map(|row_id_map| row_id_map.keys().copied()) + .map(|old_addr| (old_addr, frag_reuse_index.remap_row_id(old_addr))) + .collect(); + + let remap_result = index::remap_index(dataset, index_id, &composed_row_id_map).await?; + + let new_index_meta = match remap_result { + // The composed remap emptied the index (every row deleted). Matching the + // prior per-version behavior, leave the existing index untouched and + // commit nothing -- there is no remap to apply. 
+ RemapResult::Drop => return Ok(()), + RemapResult::Keep(new_id) => IndexMetadata { + uuid: new_id, + name: curr_index_meta.name.clone(), + fields: curr_index_meta.fields.clone(), + dataset_version: dataset.manifest.version, + fragment_bitmap: bitmap_after_remap, + index_details: curr_index_meta.index_details.clone(), + index_version: curr_index_meta.index_version, + created_at: curr_index_meta.created_at, + base_id: None, + files: curr_index_meta.files.clone(), + }, + RemapResult::Remapped(remapped_index) => IndexMetadata { + uuid: remapped_index.new_id, + name: curr_index_meta.name.clone(), + fields: curr_index_meta.fields.clone(), + dataset_version: dataset.manifest.version, + fragment_bitmap: bitmap_after_remap, + index_details: Some(Arc::new(remapped_index.index_details)), + index_version: remapped_index.index_version as i32, + created_at: curr_index_meta.created_at, + base_id: None, + files: remapped_index.files, + }, + }; - curr_index_id = new_id; - } - } + let transaction = Transaction::new( + dataset.manifest.version, + Operation::CreateIndex { + new_indices: vec![new_index_meta], + removed_indices: vec![curr_index_meta], + }, + None, + ); + + dataset + .apply_commit(transaction, &Default::default(), &Default::default()) + .await?; Ok(()) } From 0f2745d10a0fe5b34a1cf214466bbc0c0d13c90c Mon Sep 17 00:00:00 2001 From: Lance Release Bot Date: Wed, 17 Jun 2026 19:55:10 +0000 Subject: [PATCH 134/177] chore: release beta version 8.0.0-beta.17 --- .bumpversion.toml | 2 +- Cargo.lock | 227 +++++++------------------------------- Cargo.toml | 44 ++++---- java/lance-jni/Cargo.lock | 213 ++++++----------------------------- java/lance-jni/Cargo.toml | 2 +- java/pom.xml | 2 +- python/Cargo.lock | 213 ++++++----------------------------- python/Cargo.toml | 2 +- 8 files changed, 126 insertions(+), 579 deletions(-) diff --git a/.bumpversion.toml b/.bumpversion.toml index fe30629b529..d02ab87bd72 100644 --- a/.bumpversion.toml +++ b/.bumpversion.toml @@ -1,5 +1,5 @@ 
[tool.bumpversion] -current_version = "8.0.0-beta.16" +current_version = "8.0.0-beta.17" parse = "(?P\\d+)\\.(?P\\d+)\\.(?P\\d+)(-(?P(beta|rc))\\.(?P\\d+))?" serialize = [ "{major}.{minor}.{patch}-{prerelease}.{prerelease_num}", diff --git a/Cargo.lock b/Cargo.lock index 63819e6f678..2ef17bb2d2f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3076,7 +3076,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow-array", "rand 0.9.4", @@ -3378,17 +3378,15 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.4.2" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +checksum = "300e883d756b2e4ec94e02791f39b04b522276138852cfc41d9fb7e904106099" dependencies = [ "cfg-if 1.0.4", "js-sys", "libc", "r-efi 6.0.0", "rand_core 0.10.1", - "wasip2", - "wasip3", "wasm-bindgen", ] @@ -3765,7 +3763,7 @@ dependencies = [ "tokio", "tokio-rustls", "tower-service", - "webpki-roots 1.0.7", + "webpki-roots 1.0.8", ] [[package]] @@ -4025,12 +4023,6 @@ version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e4a2c462a4d927d512f5f882a033ddd62f33a05bb9f230d98f736ac3dc85938f" -[[package]] -name = "id-arena" -version = "2.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" - [[package]] name = "ident_case" version = "1.0.1" @@ -4388,7 +4380,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "all_asserts", "approx", @@ -4491,7 +4483,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow-array", "arrow-buffer", @@ 
-4539,7 +4531,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrayref", "paste", @@ -4548,7 +4540,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow-array", "arrow-buffer", @@ -4588,7 +4580,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow", "arrow-array", @@ -4621,7 +4613,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow", "arrow-array", @@ -4640,7 +4632,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "proc-macro2", "quote", @@ -4649,7 +4641,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow-arith", "arrow-array", @@ -4694,7 +4686,7 @@ dependencies = [ [[package]] name = "lance-examples" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "all_asserts", "arrow", @@ -4720,7 +4712,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow-arith", "arrow-array", @@ -4759,7 +4751,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "datafusion", "geo-traits", @@ -4773,7 +4765,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "approx", "arc-swap", @@ -4850,7 +4842,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow", "arrow-arith", @@ -4898,7 +4890,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" 
dependencies = [ "approx", "arrow-array", @@ -4917,7 +4909,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow", "async-trait", @@ -4929,7 +4921,7 @@ dependencies = [ [[package]] name = "lance-namespace-datafusion" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow-array", "arrow-schema", @@ -4945,7 +4937,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow", "arrow-array", @@ -5009,7 +5001,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow-array", "arrow-buffer", @@ -5027,7 +5019,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow", "arrow-array", @@ -5073,7 +5065,7 @@ dependencies = [ [[package]] name = "lance-test-macros" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "proc-macro2", "quote", @@ -5082,7 +5074,7 @@ dependencies = [ [[package]] name = "lance-testing" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow-array", "arrow-schema", @@ -5095,7 +5087,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "icu_segmenter", "jieba-rs", @@ -5107,7 +5099,7 @@ dependencies = [ [[package]] name = "lance-tools" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "clap", "lance-core", @@ -5127,12 +5119,6 @@ dependencies = [ "spin 0.9.8", ] -[[package]] -name = "leb128fmt" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" - [[package]] name = "lexical-core" version = "1.0.6" @@ -7044,7 +7030,7 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" dependencies = [ "chacha20", - "getrandom 0.4.2", + "getrandom 0.4.3", "rand_core 0.10.1", ] @@ -7460,7 +7446,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams 0.4.2", "web-sys", - "webpki-roots 1.0.7", + "webpki-roots 1.0.8", ] [[package]] @@ -8631,7 +8617,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" dependencies = [ "fastrand", - "getrandom 0.4.2", + "getrandom 0.4.3", "once_cell", "rustix", "windows-sys 0.61.2", @@ -9324,12 +9310,6 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" -[[package]] -name = "unicode-xid" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" - [[package]] name = "unicode_categories" version = "0.1.1" @@ -9410,7 +9390,7 @@ version = "1.23.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "144d6b123cef80b301b8f72a9e2ca4370ddec21950d0a103dd22c437006d2db7" dependencies = [ - "getrandom 0.4.2", + "getrandom 0.4.3", "js-sys", "serde_core", "wasm-bindgen", @@ -9489,16 +9469,7 @@ version = "1.0.4+wasi-0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b67efb37e106e55ce722a510d6b5f9c17f083e5fc79afc2badeb12cc313d9487" dependencies = [ - "wit-bindgen 0.57.1", -] - -[[package]] -name = "wasip3" -version = "0.4.0+wasi-0.3.0-rc-2026-01-06" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" -dependencies = [ - "wit-bindgen 0.51.0", + "wit-bindgen", ] [[package]] @@ -9565,28 +9536,6 @@ dependencies = [ "unicode-ident", ] -[[package]] 
-name = "wasm-encoder" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" -dependencies = [ - "leb128fmt", - "wasmparser", -] - -[[package]] -name = "wasm-metadata" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" -dependencies = [ - "anyhow", - "indexmap 2.14.0", - "wasm-encoder", - "wasmparser", -] - [[package]] name = "wasm-streams" version = "0.4.2" @@ -9613,18 +9562,6 @@ dependencies = [ "web-sys", ] -[[package]] -name = "wasmparser" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" -dependencies = [ - "bitflags 2.13.0", - "hashbrown 0.15.5", - "indexmap 2.14.0", - "semver", -] - [[package]] name = "web-sys" version = "0.3.102" @@ -9647,9 +9584,9 @@ dependencies = [ [[package]] name = "webpki-root-certs" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f31141ce3fc3e300ae89b78c0dd67f9708061d1d2eda54b8209346fd6be9a92c" +checksum = "0d46a5a140e6f7afeccd8eae97eff335163939eac8b929834875168b29b3d267" dependencies = [ "rustls-pki-types", ] @@ -9660,14 +9597,14 @@ version = "0.26.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9" dependencies = [ - "webpki-roots 1.0.7", + "webpki-roots 1.0.8", ] [[package]] name = "webpki-roots" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52f5ee44c96cf55f1b349600768e3ece3a8f26010c05265ab73f945bb1a2eb9d" +checksum = "bf85cb06032201fa7c6f829d7db5a7e5aa45bcc0655327713065f6f0576731bf" dependencies = [ "rustls-pki-types", ] @@ -10034,100 +9971,12 @@ dependencies = [ 
"url", ] -[[package]] -name = "wit-bindgen" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" -dependencies = [ - "wit-bindgen-rust-macro", -] - [[package]] name = "wit-bindgen" version = "0.57.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" -[[package]] -name = "wit-bindgen-core" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" -dependencies = [ - "anyhow", - "heck", - "wit-parser", -] - -[[package]] -name = "wit-bindgen-rust" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" -dependencies = [ - "anyhow", - "heck", - "indexmap 2.14.0", - "prettyplease", - "syn 2.0.118", - "wasm-metadata", - "wit-bindgen-core", - "wit-component", -] - -[[package]] -name = "wit-bindgen-rust-macro" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" -dependencies = [ - "anyhow", - "prettyplease", - "proc-macro2", - "quote", - "syn 2.0.118", - "wit-bindgen-core", - "wit-bindgen-rust", -] - -[[package]] -name = "wit-component" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" -dependencies = [ - "anyhow", - "bitflags 2.13.0", - "indexmap 2.14.0", - "log", - "serde", - "serde_derive", - "serde_json", - "wasm-encoder", - "wasm-metadata", - "wasmparser", - "wit-parser", -] - -[[package]] -name = "wit-parser" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" -dependencies = [ - "anyhow", - "id-arena", - "indexmap 2.14.0", - "log", - "semver", - "serde", - "serde_derive", - "serde_json", - "unicode-xid", - "wasmparser", -] - [[package]] name = "wkb" version = "0.9.2" @@ -10222,7 +10071,7 @@ dependencies = [ "csv", "futures", "futures-util", - "getrandom 0.4.2", + "getrandom 0.4.3", "heapify", "itertools 0.14.0", "lazy_static", diff --git a/Cargo.toml b/Cargo.toml index 6e79a26e69f..4f6043acf1a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ resolver = "3" [workspace.package] -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" @@ -57,27 +57,27 @@ rust-version = "1.91.0" [workspace.dependencies] arc-swap = "1.7" libc = "0.2.176" -lance = { version = "=8.0.0-beta.16", path = "./rust/lance", default-features = false } -lance-arrow = { version = "=8.0.0-beta.16", path = "./rust/lance-arrow" } -lance-core = { version = "=8.0.0-beta.16", path = "./rust/lance-core" } -lance-datafusion = { version = "=8.0.0-beta.16", path = "./rust/lance-datafusion" } -lance-datagen = { version = "=8.0.0-beta.16", path = "./rust/lance-datagen" } -lance-derive = { version = "=8.0.0-beta.16", path = "./rust/lance-derive" } -lance-encoding = { version = "=8.0.0-beta.16", path = "./rust/lance-encoding" } -lance-file = { version = "=8.0.0-beta.16", path = "./rust/lance-file" } -lance-geo = { version = "=8.0.0-beta.16", path = "./rust/lance-geo" } -lance-index = { version = "=8.0.0-beta.16", path = "./rust/lance-index" } -lance-io = { version = "=8.0.0-beta.16", path = "./rust/lance-io", default-features = false } -lance-linalg = { version = "=8.0.0-beta.16", path = "./rust/lance-linalg" } -lance-namespace = { version = "=8.0.0-beta.16", path = "./rust/lance-namespace" } -lance-namespace-impls = { version = "=8.0.0-beta.16", path = "./rust/lance-namespace-impls" } +lance = { version = "=8.0.0-beta.17", path 
= "./rust/lance", default-features = false } +lance-arrow = { version = "=8.0.0-beta.17", path = "./rust/lance-arrow" } +lance-core = { version = "=8.0.0-beta.17", path = "./rust/lance-core" } +lance-datafusion = { version = "=8.0.0-beta.17", path = "./rust/lance-datafusion" } +lance-datagen = { version = "=8.0.0-beta.17", path = "./rust/lance-datagen" } +lance-derive = { version = "=8.0.0-beta.17", path = "./rust/lance-derive" } +lance-encoding = { version = "=8.0.0-beta.17", path = "./rust/lance-encoding" } +lance-file = { version = "=8.0.0-beta.17", path = "./rust/lance-file" } +lance-geo = { version = "=8.0.0-beta.17", path = "./rust/lance-geo" } +lance-index = { version = "=8.0.0-beta.17", path = "./rust/lance-index" } +lance-io = { version = "=8.0.0-beta.17", path = "./rust/lance-io", default-features = false } +lance-linalg = { version = "=8.0.0-beta.17", path = "./rust/lance-linalg" } +lance-namespace = { version = "=8.0.0-beta.17", path = "./rust/lance-namespace" } +lance-namespace-impls = { version = "=8.0.0-beta.17", path = "./rust/lance-namespace-impls" } lance-namespace-datafusion = { version = "=7.0.0-beta.9", path = "./rust/lance-namespace-datafusion" } lance-namespace-reqwest-client = "0.8.6" -lance-select = { version = "=8.0.0-beta.16", path = "./rust/lance-select" } -lance-tokenizer = { version = "=8.0.0-beta.16", path = "./rust/lance-tokenizer" } -lance-table = { version = "=8.0.0-beta.16", path = "./rust/lance-table" } -lance-test-macros = { version = "=8.0.0-beta.16", path = "./rust/lance-test-macros" } -lance-testing = { version = "=8.0.0-beta.16", path = "./rust/lance-testing" } +lance-select = { version = "=8.0.0-beta.17", path = "./rust/lance-select" } +lance-tokenizer = { version = "=8.0.0-beta.17", path = "./rust/lance-tokenizer" } +lance-table = { version = "=8.0.0-beta.17", path = "./rust/lance-table" } +lance-test-macros = { version = "=8.0.0-beta.17", path = "./rust/lance-test-macros" } +lance-testing = { version = "=8.0.0-beta.17", 
path = "./rust/lance-testing" } approx = "0.5.1" # Note that this one does not include pyarrow arrow = { version = "58.0.0", optional = false, features = ["prettyprint"] } @@ -104,7 +104,7 @@ half = { "version" = "2.1", default-features = false, features = [ "num-traits", "std", ] } -lance-bitpacking = { version = "=8.0.0-beta.16", path = "./rust/compression/bitpacking" } +lance-bitpacking = { version = "=8.0.0-beta.17", path = "./rust/compression/bitpacking" } bitpacking = "0.9" bitvec = "1" bytes = "1.11.1" @@ -143,7 +143,7 @@ datafusion-substrait = { version = "53.0.0", default-features = false } dirs = "6.0.0" either = "1.0" fst = { version = "0.4.7", features = ["levenshtein"] } -fsst = { version = "=8.0.0-beta.16", path = "./rust/compression/fsst" } +fsst = { version = "=8.0.0-beta.17", path = "./rust/compression/fsst" } futures = "0.3" geoarrow-array = "0.8" geoarrow-schema = "0.8" diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index 5dba72718b3..682c39f3fda 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -2479,7 +2479,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow-array", "rand 0.9.4", @@ -2773,17 +2773,15 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.4.2" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +checksum = "300e883d756b2e4ec94e02791f39b04b522276138852cfc41d9fb7e904106099" dependencies = [ "cfg-if 1.0.4", "js-sys", "libc", "r-efi 6.0.0", "rand_core 0.10.1", - "wasip2", - "wasip3", "wasm-bindgen", ] @@ -3374,12 +3372,6 @@ version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e4a2c462a4d927d512f5f882a033ddd62f33a05bb9f230d98f736ac3dc85938f" -[[package]] -name = "id-arena" -version = "2.3.0" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" - [[package]] name = "ident_case" version = "1.0.1" @@ -3673,7 +3665,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arc-swap", "arrow", @@ -3746,7 +3738,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow-array", "arrow-buffer", @@ -3788,7 +3780,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrayref", "paste", @@ -3797,7 +3789,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow-array", "arrow-buffer", @@ -3835,7 +3827,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow", "arrow-array", @@ -3867,7 +3859,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow", "arrow-array", @@ -3884,7 +3876,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "proc-macro2", "quote", @@ -3893,7 +3885,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow-arith", "arrow-array", @@ -3928,7 +3920,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow-arith", "arrow-array", @@ -3958,7 +3950,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "datafusion", "geo-traits", @@ -3972,7 +3964,7 @@ dependencies = [ 
[[package]] name = "lance-index" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arc-swap", "arrow", @@ -4040,7 +4032,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow", "arrow-arith", @@ -4081,7 +4073,7 @@ dependencies = [ [[package]] name = "lance-jni" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow", "arrow-array", @@ -4117,7 +4109,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow-array", "arrow-buffer", @@ -4132,7 +4124,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow", "async-trait", @@ -4144,7 +4136,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow", "arrow-ipc", @@ -4193,7 +4185,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow-array", "arrow-buffer", @@ -4208,7 +4200,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow", "arrow-array", @@ -4245,7 +4237,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "icu_segmenter", "rust-stemmers", @@ -4262,12 +4254,6 @@ dependencies = [ "spin", ] -[[package]] -name = "leb128fmt" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" - [[package]] name = "lexical-core" version = "1.0.6" @@ -5617,7 +5603,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" dependencies = [ "chacha20", - 
"getrandom 0.4.2", + "getrandom 0.4.3", "rand_core 0.10.1", ] @@ -6931,7 +6917,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" dependencies = [ "fastrand", - "getrandom 0.4.2", + "getrandom 0.4.3", "once_cell", "rustix", "windows-sys 0.61.2", @@ -7470,12 +7456,6 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" -[[package]] -name = "unicode-xid" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" - [[package]] name = "unsafe-libyaml" version = "0.2.11" @@ -7530,7 +7510,7 @@ version = "1.23.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "144d6b123cef80b301b8f72a9e2ca4370ddec21950d0a103dd22c437006d2db7" dependencies = [ - "getrandom 0.4.2", + "getrandom 0.4.3", "js-sys", "serde_core", "wasm-bindgen", @@ -7594,16 +7574,7 @@ version = "1.0.4+wasi-0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b67efb37e106e55ce722a510d6b5f9c17f083e5fc79afc2badeb12cc313d9487" dependencies = [ - "wit-bindgen 0.57.1", -] - -[[package]] -name = "wasip3" -version = "0.4.0+wasi-0.3.0-rc-2026-01-06" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" -dependencies = [ - "wit-bindgen 0.51.0", + "wit-bindgen", ] [[package]] @@ -7670,28 +7641,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "wasm-encoder" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" -dependencies = [ - "leb128fmt", - "wasmparser", -] - -[[package]] -name = "wasm-metadata" -version = "0.244.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" -dependencies = [ - "anyhow", - "indexmap 2.14.0", - "wasm-encoder", - "wasmparser", -] - [[package]] name = "wasm-streams" version = "0.4.2" @@ -7718,18 +7667,6 @@ dependencies = [ "web-sys", ] -[[package]] -name = "wasmparser" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" -dependencies = [ - "bitflags", - "hashbrown 0.15.5", - "indexmap 2.14.0", - "semver", -] - [[package]] name = "web-sys" version = "0.3.102" @@ -7752,18 +7689,18 @@ dependencies = [ [[package]] name = "webpki-root-certs" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f31141ce3fc3e300ae89b78c0dd67f9708061d1d2eda54b8209346fd6be9a92c" +checksum = "0d46a5a140e6f7afeccd8eae97eff335163939eac8b929834875168b29b3d267" dependencies = [ "rustls-pki-types", ] [[package]] name = "webpki-roots" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52f5ee44c96cf55f1b349600768e3ece3a8f26010c05265ab73f945bb1a2eb9d" +checksum = "bf85cb06032201fa7c6f829d7db5a7e5aa45bcc0655327713065f6f0576731bf" dependencies = [ "rustls-pki-types", ] @@ -8164,100 +8101,12 @@ dependencies = [ "memchr", ] -[[package]] -name = "wit-bindgen" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" -dependencies = [ - "wit-bindgen-rust-macro", -] - [[package]] name = "wit-bindgen" version = "0.57.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" -[[package]] -name = "wit-bindgen-core" -version = "0.51.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" -dependencies = [ - "anyhow", - "heck", - "wit-parser", -] - -[[package]] -name = "wit-bindgen-rust" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" -dependencies = [ - "anyhow", - "heck", - "indexmap 2.14.0", - "prettyplease", - "syn", - "wasm-metadata", - "wit-bindgen-core", - "wit-component", -] - -[[package]] -name = "wit-bindgen-rust-macro" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" -dependencies = [ - "anyhow", - "prettyplease", - "proc-macro2", - "quote", - "syn", - "wit-bindgen-core", - "wit-bindgen-rust", -] - -[[package]] -name = "wit-component" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" -dependencies = [ - "anyhow", - "bitflags", - "indexmap 2.14.0", - "log", - "serde", - "serde_derive", - "serde_json", - "wasm-encoder", - "wasm-metadata", - "wasmparser", - "wit-parser", -] - -[[package]] -name = "wit-parser" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" -dependencies = [ - "anyhow", - "id-arena", - "indexmap 2.14.0", - "log", - "semver", - "serde", - "serde_derive", - "serde_json", - "unicode-xid", - "wasmparser", -] - [[package]] name = "wkb" version = "0.9.2" @@ -8352,7 +8201,7 @@ dependencies = [ "csv", "futures", "futures-util", - "getrandom 0.4.2", + "getrandom 0.4.3", "heapify", "itertools 0.14.0", "lazy_static", diff --git a/java/lance-jni/Cargo.toml b/java/lance-jni/Cargo.toml index 12ae647ab58..ed0763650a8 100644 --- 
a/java/lance-jni/Cargo.toml +++ b/java/lance-jni/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lance-jni" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" edition = "2024" authors = ["Lance Devs "] rust-version = "1.91" diff --git a/java/pom.xml b/java/pom.xml index c9e4dcf8a9a..e371d8d7722 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -7,7 +7,7 @@ org.lance lance-core Lance Core - 8.0.0-beta.16 + 8.0.0-beta.17 jar Lance Format Java API diff --git a/python/Cargo.lock b/python/Cargo.lock index f4e52846476..c321f283b2d 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -2859,7 +2859,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow-array", "rand 0.9.4", @@ -3162,17 +3162,15 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.4.2" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +checksum = "300e883d756b2e4ec94e02791f39b04b522276138852cfc41d9fb7e904106099" dependencies = [ "cfg-if 1.0.4", "js-sys", "libc", "r-efi 6.0.0", "rand_core 0.10.1", - "wasip2", - "wasip3", "wasm-bindgen", ] @@ -3763,12 +3761,6 @@ version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e4a2c462a4d927d512f5f882a033ddd62f33a05bb9f230d98f736ac3dc85938f" -[[package]] -name = "id-arena" -version = "2.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" - [[package]] name = "ident_case" version = "1.0.1" @@ -4075,7 +4067,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arc-swap", "arrow", @@ -4149,7 +4141,7 @@ dependencies = [ [[package]] name = "lance-arrow" 
-version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow-array", "arrow-buffer", @@ -4191,7 +4183,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrayref", "paste", @@ -4200,7 +4192,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow-array", "arrow-buffer", @@ -4238,7 +4230,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow", "arrow-array", @@ -4270,7 +4262,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow", "arrow-array", @@ -4287,7 +4279,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "proc-macro2", "quote", @@ -4296,7 +4288,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow-arith", "arrow-array", @@ -4331,7 +4323,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow-arith", "arrow-array", @@ -4361,7 +4353,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "datafusion", "geo-traits", @@ -4375,7 +4367,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arc-swap", "arrow", @@ -4444,7 +4436,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow", "arrow-arith", @@ -4485,7 +4477,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow-array", "arrow-buffer", @@ -4500,7 +4492,7 @@ 
dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow", "async-trait", @@ -4512,7 +4504,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow", "arrow-ipc", @@ -4561,7 +4553,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow-array", "arrow-buffer", @@ -4576,7 +4568,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "arrow", "arrow-array", @@ -4615,7 +4607,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "icu_segmenter", "jieba-rs", @@ -4634,12 +4626,6 @@ dependencies = [ "spin", ] -[[package]] -name = "leb128fmt" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" - [[package]] name = "lexical-core" version = "1.0.6" @@ -6058,7 +6044,7 @@ dependencies = [ [[package]] name = "pylance" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" dependencies = [ "alloc-stdlib", "arrow", @@ -6314,7 +6300,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" dependencies = [ "chacha20", - "getrandom 0.4.2", + "getrandom 0.4.3", "rand_core 0.10.1", ] @@ -7733,7 +7719,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" dependencies = [ "fastrand", - "getrandom 0.4.2", + "getrandom 0.4.3", "once_cell", "rustix", "windows-sys 0.61.2", @@ -8321,12 +8307,6 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" -[[package]] -name = "unicode-xid" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" - [[package]] name = "unsafe-libyaml" version = "0.2.11" @@ -8381,7 +8361,7 @@ version = "1.23.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "144d6b123cef80b301b8f72a9e2ca4370ddec21950d0a103dd22c437006d2db7" dependencies = [ - "getrandom 0.4.2", + "getrandom 0.4.3", "js-sys", "serde_core", "wasm-bindgen", @@ -8445,16 +8425,7 @@ version = "1.0.4+wasi-0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b67efb37e106e55ce722a510d6b5f9c17f083e5fc79afc2badeb12cc313d9487" dependencies = [ - "wit-bindgen 0.57.1", -] - -[[package]] -name = "wasip3" -version = "0.4.0+wasi-0.3.0-rc-2026-01-06" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" -dependencies = [ - "wit-bindgen 0.51.0", + "wit-bindgen", ] [[package]] @@ -8521,28 +8492,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "wasm-encoder" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" -dependencies = [ - "leb128fmt", - "wasmparser", -] - -[[package]] -name = "wasm-metadata" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" -dependencies = [ - "anyhow", - "indexmap 2.14.0", - "wasm-encoder", - "wasmparser", -] - [[package]] name = "wasm-streams" version = "0.4.2" @@ -8569,18 +8518,6 @@ dependencies = [ "web-sys", ] -[[package]] -name = "wasmparser" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" -dependencies = [ - "bitflags 2.13.0", - "hashbrown 0.15.5", - "indexmap 2.14.0", - "semver", -] - [[package]] name = "web-sys" version = "0.3.102" @@ -8603,18 +8540,18 @@ dependencies = [ [[package]] name = "webpki-root-certs" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f31141ce3fc3e300ae89b78c0dd67f9708061d1d2eda54b8209346fd6be9a92c" +checksum = "0d46a5a140e6f7afeccd8eae97eff335163939eac8b929834875168b29b3d267" dependencies = [ "rustls-pki-types", ] [[package]] name = "webpki-roots" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52f5ee44c96cf55f1b349600768e3ece3a8f26010c05265ab73f945bb1a2eb9d" +checksum = "bf85cb06032201fa7c6f829d7db5a7e5aa45bcc0655327713065f6f0576731bf" dependencies = [ "rustls-pki-types", ] @@ -8949,100 +8886,12 @@ dependencies = [ "memchr", ] -[[package]] -name = "wit-bindgen" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" -dependencies = [ - "wit-bindgen-rust-macro", -] - [[package]] name = "wit-bindgen" version = "0.57.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" -[[package]] -name = "wit-bindgen-core" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" -dependencies = [ - "anyhow", - "heck", - "wit-parser", -] - -[[package]] -name = "wit-bindgen-rust" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" -dependencies = [ - "anyhow", - "heck", - "indexmap 2.14.0", - "prettyplease", - "syn 2.0.118", - 
"wasm-metadata", - "wit-bindgen-core", - "wit-component", -] - -[[package]] -name = "wit-bindgen-rust-macro" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" -dependencies = [ - "anyhow", - "prettyplease", - "proc-macro2", - "quote", - "syn 2.0.118", - "wit-bindgen-core", - "wit-bindgen-rust", -] - -[[package]] -name = "wit-component" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" -dependencies = [ - "anyhow", - "bitflags 2.13.0", - "indexmap 2.14.0", - "log", - "serde", - "serde_derive", - "serde_json", - "wasm-encoder", - "wasm-metadata", - "wasmparser", - "wit-parser", -] - -[[package]] -name = "wit-parser" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" -dependencies = [ - "anyhow", - "id-arena", - "indexmap 2.14.0", - "log", - "semver", - "serde", - "serde_derive", - "serde_json", - "unicode-xid", - "wasmparser", -] - [[package]] name = "wkb" version = "0.9.2" @@ -9137,7 +8986,7 @@ dependencies = [ "csv", "futures", "futures-util", - "getrandom 0.4.2", + "getrandom 0.4.3", "heapify", "itertools 0.14.0", "lazy_static", diff --git a/python/Cargo.toml b/python/Cargo.toml index db4f26d80c7..4447859bc38 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pylance" -version = "8.0.0-beta.16" +version = "8.0.0-beta.17" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" From 134042aabfd6c7ff3efae037bddcb37fbfa37e59 Mon Sep 17 00:00:00 2001 From: LuQQiu Date: Wed, 17 Jun 2026 13:56:40 -0700 Subject: [PATCH 135/177] feat(scalar): expose LogicalScalarIndex::try_new and load_named_scalar_segments (#7339) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit ## What Make two existing scalar-index building blocks `pub` and re-export them from `index::scalar`: - `LogicalScalarIndex::try_new` — public constructor that merges several already-opened segments of one scalar index into a single searchable `ScalarIndex`. - `load_named_scalar_segments` — list the committed, dataset-intersecting segments of a named scalar index (length `1` = a single non-segmented index, `> 1` = an index split across multiple segments). ## Why A distributed query engine needs to (1) discover how many segments a named scalar index has and (2) open an explicit subset of those segments on each executor, then present them as one index. Both capabilities already exist inside lance — `load_named_scalar_segments` lists segments, and `LogicalScalarIndex` already unions per-segment search results and fragment coverage — they were just private. The actual "open this UUID subset" helper stays in the calling engine; it is pure glue over these two plus the already-public `Dataset::open_scalar_index`, so it does not need to live in lance. ## Notes - Purely additive. No behavior change to existing callers (`open_named_scalar_index` and `scalar_index_fragment_bitmap` already used both). - `index_intersects_dataset` and `Dataset::fragment_bitmap` remain private. - `cargo check`, `clippy`, and `fmt` clean. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-authored-by: Claude Opus 4.8 --- rust/lance/src/index/scalar.rs | 2 ++ rust/lance/src/index/scalar_logical.rs | 21 +++++++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/rust/lance/src/index/scalar.rs b/rust/lance/src/index/scalar.rs index 9fb756ea154..ae2478589fb 100644 --- a/rust/lance/src/index/scalar.rs +++ b/rust/lance/src/index/scalar.rs @@ -12,6 +12,8 @@ pub(crate) mod zonemap; pub use inverted::{load_segment_details, load_segments}; +pub use crate::index::scalar_logical::{LogicalScalarIndex, load_named_scalar_segments}; + use std::sync::{Arc, LazyLock}; use uuid::Uuid; diff --git a/rust/lance/src/index/scalar_logical.rs b/rust/lance/src/index/scalar_logical.rs index 75465cc817c..f3a7b637202 100644 --- a/rust/lance/src/index/scalar_logical.rs +++ b/rust/lance/src/index/scalar_logical.rs @@ -31,7 +31,17 @@ pub struct LogicalScalarIndex { } impl LogicalScalarIndex { - fn try_new(name: String, column: String, segments: Vec>) -> Result { + /// Merge several already-opened segments of one scalar index into a single + /// searchable [`ScalarIndex`]. + /// + /// Used internally by `open_named_scalar_index`, and exposed so a + /// distributed query engine can open an explicit subset of a scalar + /// index's segments and present them as one index. + pub fn try_new( + name: String, + column: String, + segments: Vec>, + ) -> Result { let Some(first) = segments.first() else { return Err(Error::invalid_input(format!( "LogicalScalarIndex '{}' on column '{}' must contain at least one segment", @@ -210,7 +220,14 @@ fn index_intersects_dataset(index: &IndexMetadata, dataset: &Dataset) -> bool { .is_some_and(|index_bitmap| index_bitmap.intersection_len(&dataset.fragment_bitmap) > 0) } -async fn load_named_scalar_segments( +/// List the committed, dataset-intersecting segments of a named scalar index. +/// +/// Returns one [`IndexMetadata`] per usable segment. 
The result length is the +/// segment count: `1` means a single (non-segmented) index, `> 1` means the +/// index is split across multiple segments that a distributed engine may route +/// to different executors. All returned segments are validated to share the +/// same underlying index type. +pub async fn load_named_scalar_segments( dataset: &Dataset, column: &str, index_name: &str, From 909dea18b1de21a84f7574fab8335bab02dc48b8 Mon Sep 17 00:00:00 2001 From: Lance Release Bot Date: Wed, 17 Jun 2026 21:25:19 +0000 Subject: [PATCH 136/177] chore: release beta version 8.0.0-beta.18 --- .bumpversion.toml | 2 +- Cargo.lock | 48 +++++++++++++++++++-------------------- Cargo.toml | 44 +++++++++++++++++------------------ java/lance-jni/Cargo.lock | 40 ++++++++++++++++---------------- java/lance-jni/Cargo.toml | 2 +- java/pom.xml | 2 +- python/Cargo.lock | 40 ++++++++++++++++---------------- python/Cargo.toml | 2 +- 8 files changed, 90 insertions(+), 90 deletions(-) diff --git a/.bumpversion.toml b/.bumpversion.toml index d02ab87bd72..aa1657223fc 100644 --- a/.bumpversion.toml +++ b/.bumpversion.toml @@ -1,5 +1,5 @@ [tool.bumpversion] -current_version = "8.0.0-beta.17" +current_version = "8.0.0-beta.18" parse = "(?P\\d+)\\.(?P\\d+)\\.(?P\\d+)(-(?P(beta|rc))\\.(?P\\d+))?" 
serialize = [ "{major}.{minor}.{patch}-{prerelease}.{prerelease_num}", diff --git a/Cargo.lock b/Cargo.lock index 2ef17bb2d2f..4e8646c2ed6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3076,7 +3076,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4380,7 +4380,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "all_asserts", "approx", @@ -4483,7 +4483,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow-array", "arrow-buffer", @@ -4531,7 +4531,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrayref", "paste", @@ -4540,7 +4540,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow-array", "arrow-buffer", @@ -4580,7 +4580,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow", "arrow-array", @@ -4613,7 +4613,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow", "arrow-array", @@ -4632,7 +4632,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "proc-macro2", "quote", @@ -4641,7 +4641,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow-arith", "arrow-array", @@ -4686,7 +4686,7 @@ dependencies = [ [[package]] name = "lance-examples" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "all_asserts", "arrow", 
@@ -4712,7 +4712,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow-arith", "arrow-array", @@ -4751,7 +4751,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "datafusion", "geo-traits", @@ -4765,7 +4765,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "approx", "arc-swap", @@ -4842,7 +4842,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow", "arrow-arith", @@ -4890,7 +4890,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "approx", "arrow-array", @@ -4909,7 +4909,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow", "async-trait", @@ -4921,7 +4921,7 @@ dependencies = [ [[package]] name = "lance-namespace-datafusion" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow-array", "arrow-schema", @@ -4937,7 +4937,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow", "arrow-array", @@ -5001,7 +5001,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow-array", "arrow-buffer", @@ -5019,7 +5019,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow", "arrow-array", @@ -5065,7 +5065,7 @@ dependencies = [ [[package]] name = "lance-test-macros" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "proc-macro2", "quote", @@ -5074,7 +5074,7 @@ dependencies = [ [[package]] name = "lance-testing" -version = "8.0.0-beta.17" 
+version = "8.0.0-beta.18" dependencies = [ "arrow-array", "arrow-schema", @@ -5087,7 +5087,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "icu_segmenter", "jieba-rs", @@ -5099,7 +5099,7 @@ dependencies = [ [[package]] name = "lance-tools" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "clap", "lance-core", diff --git a/Cargo.toml b/Cargo.toml index 4f6043acf1a..e07965db278 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ resolver = "3" [workspace.package] -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" @@ -57,27 +57,27 @@ rust-version = "1.91.0" [workspace.dependencies] arc-swap = "1.7" libc = "0.2.176" -lance = { version = "=8.0.0-beta.17", path = "./rust/lance", default-features = false } -lance-arrow = { version = "=8.0.0-beta.17", path = "./rust/lance-arrow" } -lance-core = { version = "=8.0.0-beta.17", path = "./rust/lance-core" } -lance-datafusion = { version = "=8.0.0-beta.17", path = "./rust/lance-datafusion" } -lance-datagen = { version = "=8.0.0-beta.17", path = "./rust/lance-datagen" } -lance-derive = { version = "=8.0.0-beta.17", path = "./rust/lance-derive" } -lance-encoding = { version = "=8.0.0-beta.17", path = "./rust/lance-encoding" } -lance-file = { version = "=8.0.0-beta.17", path = "./rust/lance-file" } -lance-geo = { version = "=8.0.0-beta.17", path = "./rust/lance-geo" } -lance-index = { version = "=8.0.0-beta.17", path = "./rust/lance-index" } -lance-io = { version = "=8.0.0-beta.17", path = "./rust/lance-io", default-features = false } -lance-linalg = { version = "=8.0.0-beta.17", path = "./rust/lance-linalg" } -lance-namespace = { version = "=8.0.0-beta.17", path = "./rust/lance-namespace" } -lance-namespace-impls = { version = "=8.0.0-beta.17", path = "./rust/lance-namespace-impls" } +lance = { version = "=8.0.0-beta.18", path = "./rust/lance", 
default-features = false } +lance-arrow = { version = "=8.0.0-beta.18", path = "./rust/lance-arrow" } +lance-core = { version = "=8.0.0-beta.18", path = "./rust/lance-core" } +lance-datafusion = { version = "=8.0.0-beta.18", path = "./rust/lance-datafusion" } +lance-datagen = { version = "=8.0.0-beta.18", path = "./rust/lance-datagen" } +lance-derive = { version = "=8.0.0-beta.18", path = "./rust/lance-derive" } +lance-encoding = { version = "=8.0.0-beta.18", path = "./rust/lance-encoding" } +lance-file = { version = "=8.0.0-beta.18", path = "./rust/lance-file" } +lance-geo = { version = "=8.0.0-beta.18", path = "./rust/lance-geo" } +lance-index = { version = "=8.0.0-beta.18", path = "./rust/lance-index" } +lance-io = { version = "=8.0.0-beta.18", path = "./rust/lance-io", default-features = false } +lance-linalg = { version = "=8.0.0-beta.18", path = "./rust/lance-linalg" } +lance-namespace = { version = "=8.0.0-beta.18", path = "./rust/lance-namespace" } +lance-namespace-impls = { version = "=8.0.0-beta.18", path = "./rust/lance-namespace-impls" } lance-namespace-datafusion = { version = "=7.0.0-beta.9", path = "./rust/lance-namespace-datafusion" } lance-namespace-reqwest-client = "0.8.6" -lance-select = { version = "=8.0.0-beta.17", path = "./rust/lance-select" } -lance-tokenizer = { version = "=8.0.0-beta.17", path = "./rust/lance-tokenizer" } -lance-table = { version = "=8.0.0-beta.17", path = "./rust/lance-table" } -lance-test-macros = { version = "=8.0.0-beta.17", path = "./rust/lance-test-macros" } -lance-testing = { version = "=8.0.0-beta.17", path = "./rust/lance-testing" } +lance-select = { version = "=8.0.0-beta.18", path = "./rust/lance-select" } +lance-tokenizer = { version = "=8.0.0-beta.18", path = "./rust/lance-tokenizer" } +lance-table = { version = "=8.0.0-beta.18", path = "./rust/lance-table" } +lance-test-macros = { version = "=8.0.0-beta.18", path = "./rust/lance-test-macros" } +lance-testing = { version = "=8.0.0-beta.18", path = 
"./rust/lance-testing" } approx = "0.5.1" # Note that this one does not include pyarrow arrow = { version = "58.0.0", optional = false, features = ["prettyprint"] } @@ -104,7 +104,7 @@ half = { "version" = "2.1", default-features = false, features = [ "num-traits", "std", ] } -lance-bitpacking = { version = "=8.0.0-beta.17", path = "./rust/compression/bitpacking" } +lance-bitpacking = { version = "=8.0.0-beta.18", path = "./rust/compression/bitpacking" } bitpacking = "0.9" bitvec = "1" bytes = "1.11.1" @@ -143,7 +143,7 @@ datafusion-substrait = { version = "53.0.0", default-features = false } dirs = "6.0.0" either = "1.0" fst = { version = "0.4.7", features = ["levenshtein"] } -fsst = { version = "=8.0.0-beta.17", path = "./rust/compression/fsst" } +fsst = { version = "=8.0.0-beta.18", path = "./rust/compression/fsst" } futures = "0.3" geoarrow-array = "0.8" geoarrow-schema = "0.8" diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index 682c39f3fda..01ee724f59e 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -2479,7 +2479,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow-array", "rand 0.9.4", @@ -3665,7 +3665,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arc-swap", "arrow", @@ -3738,7 +3738,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow-array", "arrow-buffer", @@ -3780,7 +3780,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrayref", "paste", @@ -3789,7 +3789,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ 
"arrow-array", "arrow-buffer", @@ -3827,7 +3827,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow", "arrow-array", @@ -3859,7 +3859,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow", "arrow-array", @@ -3876,7 +3876,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "proc-macro2", "quote", @@ -3885,7 +3885,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow-arith", "arrow-array", @@ -3920,7 +3920,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow-arith", "arrow-array", @@ -3950,7 +3950,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "datafusion", "geo-traits", @@ -3964,7 +3964,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arc-swap", "arrow", @@ -4032,7 +4032,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow", "arrow-arith", @@ -4073,7 +4073,7 @@ dependencies = [ [[package]] name = "lance-jni" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow", "arrow-array", @@ -4109,7 +4109,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow-array", "arrow-buffer", @@ -4124,7 +4124,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow", "async-trait", @@ -4136,7 +4136,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = 
"8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow", "arrow-ipc", @@ -4185,7 +4185,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow-array", "arrow-buffer", @@ -4200,7 +4200,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow", "arrow-array", @@ -4237,7 +4237,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "icu_segmenter", "rust-stemmers", diff --git a/java/lance-jni/Cargo.toml b/java/lance-jni/Cargo.toml index ed0763650a8..b17df027736 100644 --- a/java/lance-jni/Cargo.toml +++ b/java/lance-jni/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lance-jni" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" edition = "2024" authors = ["Lance Devs "] rust-version = "1.91" diff --git a/java/pom.xml b/java/pom.xml index e371d8d7722..bfd82436e81 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -7,7 +7,7 @@ org.lance lance-core Lance Core - 8.0.0-beta.17 + 8.0.0-beta.18 jar Lance Format Java API diff --git a/python/Cargo.lock b/python/Cargo.lock index c321f283b2d..4570b3bc929 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -2859,7 +2859,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4067,7 +4067,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arc-swap", "arrow", @@ -4141,7 +4141,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow-array", "arrow-buffer", @@ -4183,7 +4183,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" 
-version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrayref", "paste", @@ -4192,7 +4192,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow-array", "arrow-buffer", @@ -4230,7 +4230,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow", "arrow-array", @@ -4262,7 +4262,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow", "arrow-array", @@ -4279,7 +4279,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "proc-macro2", "quote", @@ -4288,7 +4288,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow-arith", "arrow-array", @@ -4323,7 +4323,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow-arith", "arrow-array", @@ -4353,7 +4353,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "datafusion", "geo-traits", @@ -4367,7 +4367,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arc-swap", "arrow", @@ -4436,7 +4436,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow", "arrow-arith", @@ -4477,7 +4477,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow-array", "arrow-buffer", @@ -4492,7 +4492,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow", "async-trait", @@ -4504,7 +4504,7 @@ dependencies 
= [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow", "arrow-ipc", @@ -4553,7 +4553,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow-array", "arrow-buffer", @@ -4568,7 +4568,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "arrow", "arrow-array", @@ -4607,7 +4607,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "icu_segmenter", "jieba-rs", @@ -6044,7 +6044,7 @@ dependencies = [ [[package]] name = "pylance" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" dependencies = [ "alloc-stdlib", "arrow", diff --git a/python/Cargo.toml b/python/Cargo.toml index 4447859bc38..d43bbcf2001 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pylance" -version = "8.0.0-beta.17" +version = "8.0.0-beta.18" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" From 1f8711198346fe0af1aa89fc7189a7e18c56eec4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 17 Jun 2026 15:21:36 -0700 Subject: [PATCH 137/177] chore(deps): bump lance-namespace from 0.8.5 to 0.8.6 in /python (#7343) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit > [!WARNING] > Dependabot will stop supporting `python v3.9`! > > Please upgrade to one of the following versions: `v3.9`, `v3.10`, `v3.11`, `v3.12`, `v3.13`, or `v3.14`. > Bumps [lance-namespace](https://github.com/lance-format/lance-namespace) from 0.8.5 to 0.8.6.

Release notes

Sourced from lance-namespace's releases.

v0.8.6

What's Changed

New Features 🎉

Bug Fixes 🐛

New Contributors

Full Changelog: https://github.com/lance-format/lance-namespace/compare/v0.8.5...v0.8.6

Commits
  • 590a4eb chore: release version 0.8.6
  • f5ea043 feat(java): propagate source_task_size to generated Java clients (#356)
  • 89e0cab feat(spec): add source_task_size to RefreshMaterializedViewRequest (#355)
  • be99cb6 fix: pin central-publishing-maven-plugin to an existing version (#354)
  • See full diff in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=lance-namespace&package-manager=uv&previous-version=0.8.5&new-version=0.8.6)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- python/uv.lock | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/uv.lock b/python/uv.lock index 289ecdf3549..d7eb30d19f7 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -1083,19 +1083,19 @@ wheels = [ [[package]] name = "lance-namespace" -version = "0.8.5" +version = "0.8.6" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "lance-namespace-urllib3-client" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/d0/22/3d8eb4e913edf36cda416f1dca287147af508abe3ca89bf0e619b9fa9f54/lance_namespace-0.8.5.tar.gz", hash = "sha256:b4a5967afcbf9924300a0b9d2fb74c44a23f76907e8734ebed6e0e3a561b0df0", size = 11531, upload-time = "2026-06-11T16:20:26.77Z" } +sdist = { url = "https://files.pythonhosted.org/packages/af/12/f7ab93b29be3edbf5fc3610714bf2d06088e7f4524bfb38dfd6852458b08/lance_namespace-0.8.6.tar.gz", hash = "sha256:18232e721c8188145f4ec9389cc2dfbeeabf54a619d94885ea1b3375bee9f4af", size = 11529, upload-time = "2026-06-12T17:36:41.651Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/c0/da/afc3cdc42fc2dcf885a9d3524bf2c3bd2a9df89b1668b1806dec5e436263/lance_namespace-0.8.5-py3-none-any.whl", hash = "sha256:6d3e2b8da586d06409494b56955a63c3152eeae2883cd2e8ba4e80d20dc0de0f", size = 13383, upload-time = "2026-06-11T16:20:26.004Z" }, + { url = "https://files.pythonhosted.org/packages/a0/1b/5b1668ee2dc8910965f390640359112a31157092fcf8e000b89c79b58708/lance_namespace-0.8.6-py3-none-any.whl", hash = "sha256:571eae34f9aad70e5b05020416c2860889b9ec82993ccd0eb015e7b39c3ea309", size = 13383, upload-time = "2026-06-12T17:36:43.456Z" }, ] [[package]] name = "lance-namespace-urllib3-client" -version = "0.8.5" +version = "0.8.6" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "pydantic" }, @@ -1104,9 +1104,9 @@ dependencies = [ { name = "urllib3", version = 
"1.26.20", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "urllib3", version = "2.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/44/6f/1291523488523656342d1b424b76b4d91f3af6413b3b4ada43b888a87043/lance_namespace_urllib3_client-0.8.5.tar.gz", hash = "sha256:29922ffb5b0621e24a83183454ec3e5a5828f46d91a95d58efc35db05dec4e62", size = 228595, upload-time = "2026-06-11T16:20:23.985Z" } +sdist = { url = "https://files.pythonhosted.org/packages/c7/80/fb224b4a89c1c1638cde949cb6cce6c3aca7759effbfea46a3d9c3960b21/lance_namespace_urllib3_client-0.8.6.tar.gz", hash = "sha256:b6fb1d306e74a7576e5309919020be744527de484a63dbf5eed10f8b368548df", size = 228772, upload-time = "2026-06-12T17:36:42.609Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/10/e2/62883d1f43a283ac08f00af993c6a2b92e4ca206fa1ccba032420d8dc578/lance_namespace_urllib3_client-0.8.5-py3-none-any.whl", hash = "sha256:8af211ddc6e73df713ffb59368c94780508e732b19dacb4239d937aaff2f8e3c", size = 369857, upload-time = "2026-06-11T16:20:25.006Z" }, + { url = "https://files.pythonhosted.org/packages/c5/90/1e27de15cd1b16785a1c7312beb0a59e75c8344a815f600f58173a565bd1/lance_namespace_urllib3_client-0.8.6-py3-none-any.whl", hash = "sha256:9d78249c3fb15aa3d15d668f78f04a275af3d08d800a7027492f37996ac4968b", size = 369950, upload-time = "2026-06-12T17:36:40.438Z" }, ] [[package]] From 6484a0baabf1125252347b85d2f94cfaf4109bb3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 17 Jun 2026 15:21:52 -0700 Subject: [PATCH 138/177] chore(deps): bump geoarrow-rust-core from 0.6.1 to 0.6.3 in /python (#7342) > [!WARNING] > Dependabot will stop supporting `python v3.9`! > > Please upgrade to one of the following versions: `v3.9`, `v3.10`, `v3.11`, `v3.12`, `v3.13`, or `v3.14`. 
> Bumps [geoarrow-rust-core](https://geoarrow.org/geoarrow-rs/) from 0.6.1 to 0.6.3. [![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=geoarrow-rust-core&package-manager=uv&previous-version=0.6.1&new-version=0.6.3)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- python/uv.lock | 112 +++++++++++++++++++++++++------------------------ 1 file changed, 57 insertions(+), 55 deletions(-) diff --git a/python/uv.lock b/python/uv.lock index d7eb30d19f7..57a73fc9a62 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -726,7 +726,7 @@ wheels = [ [[package]] name = "geoarrow-rust-core" -version = "0.6.1" +version = "0.6.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "arro3-core" }, @@ -735,60 +735,62 @@ dependencies = [ { name = "pyproj", version = "3.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/77/2d/3e994dd76223fac0eb597a6f55647cca51bd5a4f446d09b668697f901724/geoarrow_rust_core-0.6.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:84d972cc3dd45a797fd99588d7ee68f257e4083ebdcecad9ec773260067f71a6", size = 3570129, upload-time = "2025-12-03T18:51:07.148Z" }, - { url = "https://files.pythonhosted.org/packages/5f/2a/e19df203b4ffb225f39627e1bd1b89ce7b2220e39f1d6972692174820c57/geoarrow_rust_core-0.6.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:bc0f382d4ed41e85d2d89fc2c7c8c3d046681c9a5e19350ce79e0e930cf69821", size = 3333881, upload-time = "2025-11-21T01:49:28.959Z" }, - { url = "https://files.pythonhosted.org/packages/52/98/b749a2165dfc5d9c54a1c19eb3e6a75b6d005ecde42289b25c1c355346b7/geoarrow_rust_core-0.6.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:80e719edcaf6698ed2b1aa9525bd97cf79e23a500a39b1e83566cd9a16a294d3", size = 3806366, upload-time = "2025-11-21T01:48:03.525Z" }, - { url = "https://files.pythonhosted.org/packages/84/93/7c0e42ba7d46208fb0f851e06c05de071962170f3a3b2a2260d8a3f66e7a/geoarrow_rust_core-0.6.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:d0f3546a15503329880063aca31266b301b0b781f618f832585bcd1c9efcc876", size = 3981800, upload-time = "2025-11-21T01:48:17.789Z" }, - { url = "https://files.pythonhosted.org/packages/de/43/9c5736569dead60b33e46b7c485e24804d950693df70dee306e153547789/geoarrow_rust_core-0.6.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6937f3cabebf673f8b726d60d8ca160b46401de8b08c8e257be22772c12c2001", size = 5068955, upload-time = "2025-11-21T01:48:32.569Z" }, - { url = "https://files.pythonhosted.org/packages/71/5e/f26f9bea2af96b0d070e980dcc2196d369a678e06141ed260de5ca72bcc2/geoarrow_rust_core-0.6.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f29ba92053e8ad4bd60d72188518f033ca4abc1f34eecebeb41ee7b790612e00", size = 4104946, upload-time = "2025-11-21T01:48:45.801Z" }, - { url = "https://files.pythonhosted.org/packages/fa/08/473796b3e0c03b35292220de88c8efa3e74d6174e807b26a371f2523a4b0/geoarrow_rust_core-0.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14a5d05a312fbb76821566b1d144c64d0923fcbd790b2c7376ee11f62472b2fe", size = 3917533, upload-time = "2025-11-21T01:49:14.631Z" }, - { url = "https://files.pythonhosted.org/packages/b9/7a/7b62b839c3a9878a7d91b8395e0b7b04483e4bec687e073df0fbd4056583/geoarrow_rust_core-0.6.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:88fe8fd33b16a06e9b3b7638b51d24047f1d01af12cc2e3e2653140877bddef6", size = 4318837, upload-time = "2025-11-21T01:48:58.953Z" }, - { url = "https://files.pythonhosted.org/packages/ea/86/309c55a9c63f316e3a04949ade8847b8e5acbdd21645696911175f0e1814/geoarrow_rust_core-0.6.1-cp310-cp310-win_amd64.whl", hash = "sha256:dbecc2487cc95526ac77797cd70c199e196811b0a9e877c1b61fcaca508575fa", size = 3320081, upload-time = "2025-11-21T01:49:58.861Z" }, - { url = 
"https://files.pythonhosted.org/packages/1a/ed/514cff089185d71242a62e774e2c59dda147baab65929851b66d72198d5d/geoarrow_rust_core-0.6.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:e26ca240d7a6a0fa1b4f56a9ebe07b2e14fc7c1c9507aa862bd31ef14e0521f0", size = 3572326, upload-time = "2025-12-03T18:51:08.477Z" }, - { url = "https://files.pythonhosted.org/packages/77/21/22f8233235bd020db22b4f2bf888f9aeed08813eda7b8b421a6963bdc7e4/geoarrow_rust_core-0.6.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:46876e3528685673e08b4cbc696dca7f22fb073a83318708b0eaf640107b923b", size = 3335166, upload-time = "2025-11-21T01:49:30.632Z" }, - { url = "https://files.pythonhosted.org/packages/bb/eb/0c2e40a6a1bd450347a8a9fc7648ca840710bc177ff6eed3fc5da6ef981a/geoarrow_rust_core-0.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5502bd12ede712d9b4725753df4db231a0aa6d3e131079bc4b6452c436e37b7", size = 3800540, upload-time = "2025-11-21T01:48:05.583Z" }, - { url = "https://files.pythonhosted.org/packages/4c/42/22d3b8441bb7041a6fcdb4cf0a1108e150513a52f8a407715188412bc71f/geoarrow_rust_core-0.6.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8f04dd7dd03449dba6d15f7d35c6c708637ac05f125638f56206e876756cd4c5", size = 3984840, upload-time = "2025-11-21T01:48:19.719Z" }, - { url = "https://files.pythonhosted.org/packages/12/44/477b6b2389398dc983026a4ab7dbb7ec121284ad5fb864a1b7a4658c3881/geoarrow_rust_core-0.6.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d2afce33d0c3fa87d5d4d24d6617732e4297da3372b1746569b759f9b62aede1", size = 5067358, upload-time = "2025-11-21T01:48:34.373Z" }, - { url = "https://files.pythonhosted.org/packages/62/50/6995e9d11462635972b2fc09c8e1e510928563ca4fb0fd2c9145cf6ef771/geoarrow_rust_core-0.6.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e63cdb661652a9836dc86cb5995ad269817d88b80f4cce6ed236a7f80f0aba", size = 4105773, upload-time = 
"2025-11-21T01:48:47.461Z" }, - { url = "https://files.pythonhosted.org/packages/a3/21/b369208495f213db0a0e7d563358307a706cc6af0cb9c897dacf28ae06a1/geoarrow_rust_core-0.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:adbaf97cb770aef69df8a16437c9faa67adb2b04856faf45bcb61d5b986101dc", size = 3914659, upload-time = "2025-11-21T01:49:16.35Z" }, - { url = "https://files.pythonhosted.org/packages/1d/49/fccb14c6ee9bb715451e4d5bbe3d571eb59a8a1abe21b2abe0d9d48a7fac/geoarrow_rust_core-0.6.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:202f35b301caa5154d95fd74424a1ef6449306e4f6fbfb5140270e48e94188a5", size = 4315153, upload-time = "2025-11-21T01:49:01.075Z" }, - { url = "https://files.pythonhosted.org/packages/c0/1c/88b16510e24a4a3332284669085673701b9fe4d6a511b4466c90655a9daf/geoarrow_rust_core-0.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:491405dfcc821a2c599e381cc9923e04a758deb1cc84fdb5794b519446c2f8a8", size = 3320510, upload-time = "2025-11-21T01:50:15.545Z" }, - { url = "https://files.pythonhosted.org/packages/cb/5f/1dbdbc1dde2140937cff20188cb25034b6f39e1734c14ca6510cf464bf77/geoarrow_rust_core-0.6.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a8145a562e94419402dd0882bb62429853804c53d47dbea944f2a24abc57abd2", size = 3568115, upload-time = "2025-12-03T18:51:09.743Z" }, - { url = "https://files.pythonhosted.org/packages/fd/e1/b62676f89ef3b866676967989ee8dbbd3d16c77f69aa4287825703268c42/geoarrow_rust_core-0.6.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:51040a5afcfa0cd3ab372d981375c7fe8eb652d155e3964d52ed51d14faa04e8", size = 3325336, upload-time = "2025-11-21T01:49:32.67Z" }, - { url = "https://files.pythonhosted.org/packages/1f/89/94e20f255712ff0eaccf9bfeac4bf51953ebcef0599cfc92f67037f8ab1a/geoarrow_rust_core-0.6.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2fbf8506848b0254b3c89b27c045be38bbef6372b21714cad45d76b0c8cb92ce", size = 3808535, upload-time = 
"2025-11-21T01:48:07.618Z" }, - { url = "https://files.pythonhosted.org/packages/e7/e4/37c7e2c9e251148be17292d0656d7d1ab35019678f6bd11090a41c270d18/geoarrow_rust_core-0.6.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c1a0d9c14bf2f36676016c753517d9470381969c2a67859716cceae33735f3ee", size = 3978997, upload-time = "2025-11-21T01:48:21.551Z" }, - { url = "https://files.pythonhosted.org/packages/71/27/c4ba353d9b77889136bdfd1c0cd1a04d6eade9da6e0748b06719c458afb5/geoarrow_rust_core-0.6.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6df97301782ecbaf5f2f0252011a9ff309471cde25435bdf1e17b29c263ebc16", size = 5066492, upload-time = "2025-11-21T01:48:36.142Z" }, - { url = "https://files.pythonhosted.org/packages/a6/81/34107fc9aacc489e41afed420202645675b41d85b46dc70d5ba222312791/geoarrow_rust_core-0.6.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1948cfdd0e1c7d03a0c2067821dd536ab34d1e726515202e51fbd6b0d9f775f", size = 4106130, upload-time = "2025-11-21T01:48:49.144Z" }, - { url = "https://files.pythonhosted.org/packages/92/5f/2e348b884738fb213fb3b4745955baeeaf047aecb37639e39a4dd8f12d99/geoarrow_rust_core-0.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95b1611b66c386cc6c74e990df4f114bcf24956a35e18e51bf6331c079a36688", size = 3913166, upload-time = "2025-11-21T01:49:18.228Z" }, - { url = "https://files.pythonhosted.org/packages/bf/81/fdda8bb5f84df82bc9e000435a88be46d46dda41eb5149f624ed96b7031c/geoarrow_rust_core-0.6.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1751357a1aaa26aeb5feb6f66873b6a2d369655039f7278dedcb692b512111cc", size = 4313573, upload-time = "2025-11-21T01:49:03.184Z" }, - { url = "https://files.pythonhosted.org/packages/a0/14/ca0bc7d3b158094e769ba2bbc43d203330e7e457ed67b50af97d3eac45df/geoarrow_rust_core-0.6.1-cp312-cp312-win_amd64.whl", hash = 
"sha256:16fe159043a444579948864808ebec8c49ec167ec0df3cb772dfb88de268bc91", size = 3318746, upload-time = "2025-11-21T01:50:17.319Z" }, - { url = "https://files.pythonhosted.org/packages/85/b8/94e4f8fb32ef705cf65031a24c58cdc441042a68a794b74757a6561cbc60/geoarrow_rust_core-0.6.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:6c1b692f76b613757438bf23cfe3be4a8715f0268afd8ad3ca0063c257a3be4b", size = 3568328, upload-time = "2025-12-03T18:51:11.291Z" }, - { url = "https://files.pythonhosted.org/packages/7c/45/a96e64f9febc3436766c5055508c4e823cce56577529d7b76c4e4f584bc4/geoarrow_rust_core-0.6.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1a2b4f9a8cfe852a0ba9a667258307db9e354b470b7e0a03edffd0b7daf9b6f5", size = 3325879, upload-time = "2025-11-21T01:49:34.941Z" }, - { url = "https://files.pythonhosted.org/packages/58/c0/c719ce3fb4e982e28c71f65a80cf697d07d733336e6b74d7d1b8a7daf9d0/geoarrow_rust_core-0.6.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8248330f5c3e7ec5852d0a23c23b31a08395300ef9544109e2991317beddfee3", size = 3809144, upload-time = "2025-11-21T01:48:09.562Z" }, - { url = "https://files.pythonhosted.org/packages/e2/8e/2ab3563b2ffd13f2dd69c050a901de0a4bb325879531a66f56d30bc7337e/geoarrow_rust_core-0.6.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:775e9fe45c06d02be59b1497c60aa4f7a7c1d460387bf5f63142faf39b8ad4ff", size = 3978886, upload-time = "2025-11-21T01:48:23.335Z" }, - { url = "https://files.pythonhosted.org/packages/db/0a/31625caa0a32e8e9e7aaf2514a840dda0dadf8e2452710ebc10e5f469494/geoarrow_rust_core-0.6.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:94de8fb01da3f22332eab28b03570c43cc36492ce482c254fe87e851ae21285b", size = 5065429, upload-time = "2025-11-21T01:48:37.896Z" }, - { url = 
"https://files.pythonhosted.org/packages/11/8d/ee247bd4ccf3b0791b8669357d440e3960d4dbd5cca940a2e226e8910c31/geoarrow_rust_core-0.6.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c70a63d1d36687a53dc6c2933446b1435c187e4c616cd84844d89b6ba13bc4f6", size = 4105436, upload-time = "2025-11-21T01:48:50.874Z" }, - { url = "https://files.pythonhosted.org/packages/a9/fb/c1e92716ee5aa00d48b650f0cb43220a1bf4088c8d572dfc21d400b16723/geoarrow_rust_core-0.6.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e505312f2761393fe5158242f3f2d77e9daa5cca63badd8d66e6d1d69fc17bf", size = 3913672, upload-time = "2025-11-21T01:49:19.873Z" }, - { url = "https://files.pythonhosted.org/packages/f8/6f/ef47f6070c5d5cf0d061d5f5ba95aed7e895e4720a784b84c911c0209fc0/geoarrow_rust_core-0.6.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a732e58549108df8267ab72fa6cc7c54e5a9e30b818d8d869e301a9de9d3029e", size = 4313496, upload-time = "2025-11-21T01:49:04.953Z" }, - { url = "https://files.pythonhosted.org/packages/3c/ac/2696b979623ea02129e342f8820c89d03fa5a253a913ad00b588d6dd2948/geoarrow_rust_core-0.6.1-cp313-cp313-win_amd64.whl", hash = "sha256:9e1d6492b1388b9d5ae898728838ada78dbf2340d2e9dd25ad3df6ccdd058813", size = 3318780, upload-time = "2025-11-21T01:50:18.928Z" }, - { url = "https://files.pythonhosted.org/packages/4e/42/0cb3af24b01d3897a9eee6af5cc0676bf6b80364e0d4638e45a5fc873d35/geoarrow_rust_core-0.6.1-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:3748cc8e8cb2bcedaede27cefed6749d4eea93e358b49a2f0b061d8974dd1b91", size = 3560313, upload-time = "2025-12-03T18:51:12.897Z" }, - { url = "https://files.pythonhosted.org/packages/51/bc/33f8c918e46188707ab358752b993bee9184fa62e580998c1ec4c37885c1/geoarrow_rust_core-0.6.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1b0e232fe4e239ca435d0bab638934eee87d758024c1727ee24a2b8bc4d8bc7b", size = 3321855, upload-time = "2025-12-03T18:51:00.056Z" }, - { url = 
"https://files.pythonhosted.org/packages/f4/d7/aeb2a3922670ad57f62cb591bd0309a8300ceeec6efc7f925a563c9da672/geoarrow_rust_core-0.6.1-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:843444ada2c7f7670fd9df3bdebd93e5247b376d1dd20c4fb3828632847ab78e", size = 3799057, upload-time = "2025-12-03T18:50:28.982Z" }, - { url = "https://files.pythonhosted.org/packages/76/08/606e55fc2a0e85b02e0fde7dec2014eb8f1463e8a823496d72a3095de73d/geoarrow_rust_core-0.6.1-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:880641183a09ebfbca3a6357071f137d1a4b0f1ba606fb9127a01cf58faaef56", size = 3968892, upload-time = "2025-12-03T18:50:34.661Z" }, - { url = "https://files.pythonhosted.org/packages/10/1f/e75fd5b59e9e582190c11ec73c91728d96e90608a22e0aed7365439d9534/geoarrow_rust_core-0.6.1-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6bb69024257d2fd20da691d1e15bcced874d278884218b64690256982fa30cb1", size = 5049247, upload-time = "2025-12-03T18:50:40.542Z" }, - { url = "https://files.pythonhosted.org/packages/7e/95/2257b9b148c8c6557387e67828a5096ebc519b997a158ffb67a0987589e5/geoarrow_rust_core-0.6.1-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:85464a1bab81068789de5fb19684e43709d2ba6d64d5655aace7c50b35893d6d", size = 4099850, upload-time = "2025-12-03T18:50:45.341Z" }, - { url = "https://files.pythonhosted.org/packages/b9/07/8c8aaf8755ee7c137f0898823bd005ffb16edaa6accc0cc1a9a747d56ddc/geoarrow_rust_core-0.6.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7eb773a101f1d9716d750bb326991885a7c4576e85d9a016a567a3b07380bf07", size = 3908308, upload-time = "2025-12-03T18:50:55.587Z" }, - { url = "https://files.pythonhosted.org/packages/dc/7e/b8f1933be03d9a3a6416edf29fc23d520e45f00fbde6bd8f0614ad6f8a69/geoarrow_rust_core-0.6.1-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:920e6fed857acd2145a8fca7c6fad17094873f586ac5efed7049ce43a7af4ff6", 
size = 4307178, upload-time = "2025-12-03T18:50:50.429Z" }, - { url = "https://files.pythonhosted.org/packages/df/95/a8ba3d7e51ec02ec954d0247c6021b36de5935a9a3845c1cf6c1348cd6e3/geoarrow_rust_core-0.6.1-cp314-cp314-win_amd64.whl", hash = "sha256:9887119cc31a763c34ed8676d06434b47971517e86f8e35c640b494d05e7d5ac", size = 3316511, upload-time = "2025-12-03T18:51:18.831Z" }, - { url = "https://files.pythonhosted.org/packages/ea/6d/4b2f51d0e4ac683217852d79c3acef719ca116f418d9ce8f4dcc6d717716/geoarrow_rust_core-0.6.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:420a720217b5a7ec6f7977cfe7e7a729c73381ed5e63112fdef33bd805b9cf8a", size = 3572216, upload-time = "2025-12-03T18:51:14.544Z" }, - { url = "https://files.pythonhosted.org/packages/f0/55/85a2948b10ad9ea347597f90355d8992745f00fedae54916205c8c9b80fb/geoarrow_rust_core-0.6.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0fb9c8c6bba4e712edf475ce3c78bf13f7b10f750256f57deb29c3222eaef033", size = 3335928, upload-time = "2025-11-21T01:49:51.601Z" }, - { url = "https://files.pythonhosted.org/packages/4e/98/fdd6c34ff8acd878c31e9f5fe4792f49d437e0465e0b60c24d6cdc287ed7/geoarrow_rust_core-0.6.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c9547ead76eac906b7a583ee65fa137e6b8ed34c0f128c1745a290c451726f27", size = 3808249, upload-time = "2025-11-21T01:48:11.192Z" }, - { url = "https://files.pythonhosted.org/packages/8a/a1/fd6741b5c1d7d48b5f6ab58a994a91c86e29d19ee7bca2636590b8ac9a54/geoarrow_rust_core-0.6.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eaa8e8f40ca8fcd367735cb4226c5aa5171a713d75bc2caab9a03bd9f59d7bf2", size = 3984081, upload-time = "2025-11-21T01:48:25.595Z" }, - { url = "https://files.pythonhosted.org/packages/91/1e/2b5a9b65bf19a79d212ea0fe60fa5632ec4c89bb64ee446272b47e5cd6ac/geoarrow_rust_core-0.6.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:08992719a2accbf993837a6aad615e3f2bf1954d2d9152e507dd79621c87e9d3", size = 
5071749, upload-time = "2025-11-21T01:48:39.673Z" }, - { url = "https://files.pythonhosted.org/packages/08/7a/6b37f5e52300b60854b74f4cdc9fbe613c692a15c3ae42f1952f3849bc86/geoarrow_rust_core-0.6.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:071c0e72c4c2047326ebec8d76ce2debcdd59e187207433c3a29ac2da861ca92", size = 4107621, upload-time = "2025-11-21T01:48:52.632Z" }, - { url = "https://files.pythonhosted.org/packages/e8/3e/f849642ef4e1f54bcc651903f19a219c3d2be68d27f4ceb282a07ebba7cd/geoarrow_rust_core-0.6.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c49d5a9e7b73c30dd1790a3e0faf30b7a4ee393c127c5a799d543653d1d80f0c", size = 3919352, upload-time = "2025-11-21T01:49:21.495Z" }, - { url = "https://files.pythonhosted.org/packages/84/c8/57318cb04d061788d5ba523984915c98523e9eb9b7ba4937ff3438e045ef/geoarrow_rust_core-0.6.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:909152922ee42197b8ae846a8b6c5383c6f3ab39fe627ec8539765e3a634de68", size = 4320006, upload-time = "2025-11-21T01:49:06.588Z" }, - { url = "https://files.pythonhosted.org/packages/13/9f/be16e191fdedbac4d9c01096327917a948625619423c666ec3db2191b4ab/geoarrow_rust_core-0.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:796c84184fe5e65e30df9f9f45aa8c1680f07689ea71ed1960faa7324fb67e52", size = 3321071, upload-time = "2025-11-21T01:50:20.844Z" }, + { url = "https://files.pythonhosted.org/packages/70/a7/9de5cdcb86089ef4d9a24940838a72ef0655d5be11b46dc4ee807b0d7772/geoarrow_rust_core-0.6.3-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:e1dbbca927858c05ef4eaa5e13a3977a62183cfa3f17fe7b19dd2d88ecf24e91", size = 3855749, upload-time = "2026-06-11T19:24:32.965Z" }, + { url = "https://files.pythonhosted.org/packages/54/48/da86c2bd1db71849f003f5a8eb78ce54f7a33341d5b33ddcdb480b5aafb4/geoarrow_rust_core-0.6.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ce7e126d340f335bcc108327cbf7264539e856cb6a299f59757a6ee8329f6643", size = 3710538, upload-time = 
"2026-06-11T19:24:34.925Z" }, + { url = "https://files.pythonhosted.org/packages/f6/65/7f8ecc05447a85f14643170de8a29715e7c3e732fbb7132617772d39eac7/geoarrow_rust_core-0.6.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88eb7982c1345fc4c4b18d9895602f0148c9495fe7ac00df03a92c20c8058149", size = 4198382, upload-time = "2026-06-11T19:24:37.02Z" }, + { url = "https://files.pythonhosted.org/packages/41/57/b11fbb277fab166d8a8940bc1151bbd1aeef537e70c55f495ff85178f827/geoarrow_rust_core-0.6.3-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c8368b91d4cab5cb5ad1b0f7369da4cec196d82bf73aa3823618a99c1bd4cf04", size = 4270350, upload-time = "2026-06-11T19:24:38.726Z" }, + { url = "https://files.pythonhosted.org/packages/6d/16/0c35e5aff4aca77d818b28d79f9ce20fe1c282ef26d6a2fcc764f3a55f26/geoarrow_rust_core-0.6.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2955d82d0204197c8e96adbfb70f252fa5987821dd8f202e712a84bfb5b876d3", size = 5602389, upload-time = "2026-06-11T19:24:40.198Z" }, + { url = "https://files.pythonhosted.org/packages/e0/06/58e4d0c94f7d8897ca5e2469fe5db0dd937bfc3cd676dea43c6ce488effe/geoarrow_rust_core-0.6.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cacbc2231b03c674975d5a25ff549c367dd8c07147c41edb5461c8ebda693739", size = 4414385, upload-time = "2026-06-11T19:24:41.779Z" }, + { url = "https://files.pythonhosted.org/packages/09/65/902e986d01d4978e752c1d0d5b15873de712321ce3f61c285f491e4149b9/geoarrow_rust_core-0.6.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f5726fd638563d11dfefd7d17dd769e679ac1efb868178791573de19d16b41f", size = 4251263, upload-time = "2026-06-11T19:24:43.556Z" }, + { url = "https://files.pythonhosted.org/packages/2c/f1/b1e0f93ea5288706f08ac7c01f332eb0feaa128251f3c2c9896e5f42cba5/geoarrow_rust_core-0.6.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:df7a0319cdec5d0e4ffc3f17a171e16787e7719f85f82c8cf0035d873ec31e62", size = 4747229, upload-time = "2026-06-11T19:24:45.281Z" }, + { url = "https://files.pythonhosted.org/packages/eb/f3/77ebd20cb5cf5eb18c5bb0e32e07f76ec915a728ea123e075365f0b6c53c/geoarrow_rust_core-0.6.3-cp310-cp310-win_amd64.whl", hash = "sha256:19ce5fb18025480461253d0a03f20cbb635163214b5f193b0700bc1a407dfe4d", size = 3601298, upload-time = "2026-06-11T19:24:46.721Z" }, + { url = "https://files.pythonhosted.org/packages/02/a8/d50e482a56d9543119be40000bc405b725242b6056809bbee3a75eff2411/geoarrow_rust_core-0.6.3-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:d91b5249d5e1da53a79268759601c107beb69a8944dd3b5b225e9515ab63d519", size = 3856056, upload-time = "2026-06-11T19:24:48.331Z" }, + { url = "https://files.pythonhosted.org/packages/04/e3/f4de7795959d95d88b32b85740d5d2d6b0a2e17233258f0331aee6cb7b13/geoarrow_rust_core-0.6.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:14412f02c1e60c92d2f88bc9f92835cf6d80f1da37fe8ba462eafdb7bd570f3c", size = 3710092, upload-time = "2026-06-11T19:24:49.802Z" }, + { url = "https://files.pythonhosted.org/packages/b4/48/04888477c2a12fbe6a6f8898bd026facdc3a929b4e747d7b569e6d20dd58/geoarrow_rust_core-0.6.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc5d6db2341568b1e44678ccc0ade1ca1e7660a2c186ebf8bf847acdb160f2cf", size = 4197891, upload-time = "2026-06-11T19:24:51.245Z" }, + { url = "https://files.pythonhosted.org/packages/fb/2d/c16b6eb6f9f2ab213dcd0cd2ac0dec2eae1e2ce5922b3fbeb7bb1ac2a865/geoarrow_rust_core-0.6.3-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:45f4193b9d6f6caae969d8448f3687a19f0998d757519a091df609c06ffa68a0", size = 4269771, upload-time = "2026-06-11T19:24:52.781Z" }, + { url = "https://files.pythonhosted.org/packages/47/fd/2ee73341c37d554ce8d0b67a95525700ec32194fa785261c17262afadfc8/geoarrow_rust_core-0.6.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:bf9ca054562fb4610c8e5ea140fa1bf746ccc16de505d3a5684abd2fa11f9538", size = 5601846, upload-time = "2026-06-11T19:24:54.63Z" }, + { url = "https://files.pythonhosted.org/packages/67/05/229234ae7bf1d39306e41896f3055a2ae847707ce58f21bd0872b9a5764e/geoarrow_rust_core-0.6.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ec9530fefb653f9a2e605cc26fc1c0d1ffa5c4923ec1037323ba9a16744f8ccc", size = 4413741, upload-time = "2026-06-11T19:24:56.015Z" }, + { url = "https://files.pythonhosted.org/packages/eb/5a/7875548a48231b02f909d3d8c7d74ba47867b2af3396e7aed59cd3b2b40d/geoarrow_rust_core-0.6.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2813aceabb29567d96f29fd2d3099d6f8decd0f5f968ff81ed1a664751dc84a3", size = 4251434, upload-time = "2026-06-11T19:24:57.527Z" }, + { url = "https://files.pythonhosted.org/packages/bf/46/ed0370def1a950f185edda603a02276bb412a9c95ad5a052c9e919b2df78/geoarrow_rust_core-0.6.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:49686767d1379ff3b165f9d35a73e96fc25daba786ce27cf3359c5feac880fd0", size = 4746598, upload-time = "2026-06-11T19:24:58.979Z" }, + { url = "https://files.pythonhosted.org/packages/44/bc/3a1720be855d7d0011416b7f0a7b7e33546b0fc7320faf59b05e401adff7/geoarrow_rust_core-0.6.3-cp311-cp311-win_amd64.whl", hash = "sha256:fd9cc8c47af736dd087575306088e73b28a720f52e5c3342968851ddd2fb5778", size = 3601329, upload-time = "2026-06-11T19:25:00.459Z" }, + { url = "https://files.pythonhosted.org/packages/24/b2/65db3af5fcc7d64ac7ac86d7debc6a90803bb076c8f7d4599c167be79fd6/geoarrow_rust_core-0.6.3-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:86aaa60e5b6d99be08f9adc9e58bd088135e1dcfebd290085228ed8a0e93e90f", size = 3848323, upload-time = "2026-06-11T19:25:02.079Z" }, + { url = "https://files.pythonhosted.org/packages/27/9a/37bdd36d7feb9d591b9ccdc1952c6171b04dc777b999e2082b810eb1dd45/geoarrow_rust_core-0.6.3-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:fec148cd63e616d9a7aa00c4ab08693eeec55aca7c9d700aa6451cd8001d0e08", size = 3707679, upload-time = "2026-06-11T19:25:03.594Z" }, + { url = "https://files.pythonhosted.org/packages/45/b7/8d2998284de21d0feb2a0935c41636f8ebf2b65723d8139026e7f9f3d5e8/geoarrow_rust_core-0.6.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b1944f3d548b6296e9fbd668602accae0ad68e49ee0f5b8df9e7ea4f474e4ae", size = 4190279, upload-time = "2026-06-11T19:25:05.21Z" }, + { url = "https://files.pythonhosted.org/packages/25/f3/140209f53a70f261ef1459b08eea25c4edef3ad9f6ec0924033b5285ee7e/geoarrow_rust_core-0.6.3-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7f5c04195cbedf5d1684a50203e862d979cda0d6218aac32f607d6e3f7cd65c8", size = 4264876, upload-time = "2026-06-11T19:25:06.654Z" }, + { url = "https://files.pythonhosted.org/packages/14/32/0097bfb92816ef91b38f7e757f65fe8456e56152ca51cd7a05b1be8a2e40/geoarrow_rust_core-0.6.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:671c6be9cbc68295a68598fc8c6ddd875de063a795d64b2cfd10d36abd1ee324", size = 5586563, upload-time = "2026-06-11T19:25:08.376Z" }, + { url = "https://files.pythonhosted.org/packages/fd/86/508fe299aa44afe95399d9fa73cdbc7a451841803b8f1431e8c3d0b26ec1/geoarrow_rust_core-0.6.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5f4726fbe09d545a507993f2f76c2be7812fef3c20c994ff33c32aaa96aaa212", size = 4402886, upload-time = "2026-06-11T19:25:10.302Z" }, + { url = "https://files.pythonhosted.org/packages/46/81/fc34afcce2b0f17424610405481f69f3c6e4d670c5c94170d71ed6719794/geoarrow_rust_core-0.6.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0fa37a90312e7ca06921be56cee183c12c442b345fadd982480cd1f8ed2eede", size = 4247331, upload-time = "2026-06-11T19:25:11.857Z" }, + { url = 
"https://files.pythonhosted.org/packages/ff/0d/af42431f80282a2f7e1f3e496c39483dd2362e11f8008c65033be9d2ba4c/geoarrow_rust_core-0.6.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3f41a8c0a9f3558d73537dcad83c88b29c2a169bcc7766dc677e8245a98a5e95", size = 4741954, upload-time = "2026-06-11T19:25:13.964Z" }, + { url = "https://files.pythonhosted.org/packages/cc/e5/be80aa4384f16be6a20828fd4cc67da18bd2266366f80c9bfefa481559f8/geoarrow_rust_core-0.6.3-cp312-cp312-win_amd64.whl", hash = "sha256:382f0914c75d84b87420aef7b6f11e8b5d4d58b5f5db7c8d199815e4dd282a42", size = 3599115, upload-time = "2026-06-11T19:25:15.357Z" }, + { url = "https://files.pythonhosted.org/packages/19/52/93bbf15979ce656d09821f02f82420957fdc99ee4cd37e5e2d8c99a324da/geoarrow_rust_core-0.6.3-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:c11190008ed6a571b8ca4ef769198e95434dbe7c3caefa9acd5f0ceba1ed868f", size = 3848682, upload-time = "2026-06-11T19:25:16.914Z" }, + { url = "https://files.pythonhosted.org/packages/a8/1e/1665171a3756b1977b7240a8f518bbbdfa778dcc156e0f90d659723468fb/geoarrow_rust_core-0.6.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1308ad09912fb67a43ff7dd7dbc685ca8a8fbd8028d3876eb187b6b082a98a7b", size = 3707868, upload-time = "2026-06-11T19:25:22.483Z" }, + { url = "https://files.pythonhosted.org/packages/ec/38/e344ccb72473b8756c8f2dae3a8a9339e1821884a2a50befbad45150d178/geoarrow_rust_core-0.6.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1247b961c61656596631ca3380d405f8d0a2f60f045f8b8a3a335b1a849dc55", size = 4189835, upload-time = "2026-06-11T19:25:24.116Z" }, + { url = "https://files.pythonhosted.org/packages/22/10/bc92b9fcdc628fa1ff7e234219701cd575b0a78da5fdf3a6c8884e5ca445/geoarrow_rust_core-0.6.3-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5c2cb90116255c3f74d5aee563405f3a440bd4eb75471adac13cd0c80a2564dc", size = 4265584, upload-time = "2026-06-11T19:25:25.628Z" }, + { url = 
"https://files.pythonhosted.org/packages/a6/ed/67edd70967851bef3ef9e35d8ccef242923ed69104ecb885ad3adf4de9a2/geoarrow_rust_core-0.6.3-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2a993d3a0964b8cf55a51bd404225dc3037b51f34b01c6bb1312611ce61f9b2d", size = 5586300, upload-time = "2026-06-11T19:25:27.32Z" }, + { url = "https://files.pythonhosted.org/packages/76/a6/a20fba654caa314b4688ad9dceb5e99fa7956bbf92b3059baa36e06c59b3/geoarrow_rust_core-0.6.3-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cbd153a3348d166ecb57b2770b69b17c2df14cf303d41cd9168adba77532a31b", size = 4402375, upload-time = "2026-06-11T19:25:28.799Z" }, + { url = "https://files.pythonhosted.org/packages/ed/5d/c8949bb5916ff80186c854792b9ddadc9f3069db09d31311f24d82ba7096/geoarrow_rust_core-0.6.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1fb5aaf3a6f104145b4c5a3188b1be589849b2599626c0e40181a18fc2e79f68", size = 4246712, upload-time = "2026-06-11T19:25:31.015Z" }, + { url = "https://files.pythonhosted.org/packages/b5/36/c9b7afa2929b697a164ae18f35aba517bcab85efcf19cb48ffa5ac66642b/geoarrow_rust_core-0.6.3-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c3b33be8308a479f3a3a6d3a664861d6b5f8b1ad8822798f5a7e5d9af0b924eb", size = 4742863, upload-time = "2026-06-11T19:25:32.468Z" }, + { url = "https://files.pythonhosted.org/packages/57/5c/55a8d753bff924959837c39c9aa37c7813c5929570a2629ae4ece811505f/geoarrow_rust_core-0.6.3-cp313-cp313-pyemscripten_2025_0_wasm32.whl", hash = "sha256:a090191ae224e8490a95e68038db7a14df8f0326706f10c2e958621bf6c06ef5", size = 1979216, upload-time = "2026-06-11T19:25:33.905Z" }, + { url = "https://files.pythonhosted.org/packages/71/c7/a9f93af9306fd3743a96cc61bfdd7fc9194c38026f7904c067d4b4a99f0c/geoarrow_rust_core-0.6.3-cp313-cp313-win_amd64.whl", hash = "sha256:2606d6f5afacdb49145b39d3e024efadf33f847b596c19c9b6d3030d6beb2721", size = 3599237, upload-time = 
"2026-06-11T19:25:35.452Z" }, + { url = "https://files.pythonhosted.org/packages/8a/7a/6993bd89e12d0b227b611a53c657b38e63f906dfca773accae3a1f3815a4/geoarrow_rust_core-0.6.3-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:370cd1ef46bf18fa598f3038fe6f417b016da211ffe060f2b60e47dd2f684a34", size = 3854961, upload-time = "2026-06-11T19:25:37.045Z" }, + { url = "https://files.pythonhosted.org/packages/c3/c4/92cbcabd2a6add1b69a76a22a349fa219bdfed8026dfab4b8ec230bf9943/geoarrow_rust_core-0.6.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4dbf733db0bc57859d1a34c4bc8c50805f19e60081496967588e43f1f606e885", size = 3708325, upload-time = "2026-06-11T19:25:38.638Z" }, + { url = "https://files.pythonhosted.org/packages/07/b3/8fc34c5efa95cd597328876b6295fbe280d4b71df615655aaa2cd1618881/geoarrow_rust_core-0.6.3-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45ac6715e790b1ca9be38ceb8ee39cdfe29395d29c83541f7a1190812290d81d", size = 4196828, upload-time = "2026-06-11T19:25:40.329Z" }, + { url = "https://files.pythonhosted.org/packages/ca/f2/bd2026862995ff96eb6b94d2fc56f7bf737d13f6bac9662481eaae23d079/geoarrow_rust_core-0.6.3-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d14917d471dce8ee5a0976ec50b5da800bab0117bfd72bc56e23518a1dbbdb3a", size = 4265577, upload-time = "2026-06-11T19:25:41.91Z" }, + { url = "https://files.pythonhosted.org/packages/3e/01/73d69c5205a34e043026a73048d210f448a986ebb577deee7ceb1923fb5a/geoarrow_rust_core-0.6.3-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:43a371299305388663131321f0d623fc70ca4a3840f973598946b5183e5ba4e4", size = 5592303, upload-time = "2026-06-11T19:25:43.503Z" }, + { url = "https://files.pythonhosted.org/packages/98/20/fe35466e526a5d363ebd9c9dd16985dbad7fd677b90e1f123a8180bceb44/geoarrow_rust_core-0.6.3-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:23eddb8dd65dfefb397762cc3c3f6bfaffb4271641bd9dc8043a9ab3aa4cd72a", 
size = 4409972, upload-time = "2026-06-11T19:25:45.114Z" }, + { url = "https://files.pythonhosted.org/packages/e5/c8/dc588827ad6e8dad75413bc1d35b5189c8a011a2be4827499a4ab9402253/geoarrow_rust_core-0.6.3-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:43ce7b3aaeb0e8c8ad7c37c84ceed49e10d0929a5a92042c3f6ec5ef33271de4", size = 4250885, upload-time = "2026-06-11T19:25:46.649Z" }, + { url = "https://files.pythonhosted.org/packages/e6/e2/a9923e4c5848ace6e3e6f09a40d3860955f7d836675affe35bc79bc27033/geoarrow_rust_core-0.6.3-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c67201bd662e4732a822f91651111bc024329b3e71eba9f4eed19e58c9cf789b", size = 4742518, upload-time = "2026-06-11T19:25:48.098Z" }, + { url = "https://files.pythonhosted.org/packages/e6/c7/3112def9e93e88341210dd22b4d04c598fb4d0726adef2114b68157354d5/geoarrow_rust_core-0.6.3-cp314-cp314-pyemscripten_2026_0_wasm32.whl", hash = "sha256:8461e6d07a7b39ab099c9885a68d5e7983d4e83a82a42dd5b331c543683c9d6e", size = 1959191, upload-time = "2026-06-11T19:25:49.668Z" }, + { url = "https://files.pythonhosted.org/packages/ed/0f/de74ce2171c408e4b4a7660f69f6dfaa294797a18a209fa85b1ea79be141/geoarrow_rust_core-0.6.3-cp314-cp314-win_amd64.whl", hash = "sha256:5d2fd45d09bf700e0ca4d30b51ebcd59fb8d1a9eb4a4d7b4fc5f53a6cca59475", size = 3603948, upload-time = "2026-06-11T19:25:51.078Z" }, + { url = "https://files.pythonhosted.org/packages/58/33/fff80f597a0efb30816e8acf153fa0751891b22abec13cb085cf5d4c48fa/geoarrow_rust_core-0.6.3-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:77a90e435db4ca0deceb3b239dd9ee4302ce19dbddfaa4bec2fad69a32d8a519", size = 3859560, upload-time = "2026-06-11T19:25:52.482Z" }, + { url = "https://files.pythonhosted.org/packages/38/44/e7b02d661718b49c7b1fa609d2efc5b61276cb0817aeb5cd6a3f50f4834b/geoarrow_rust_core-0.6.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0699909bba568e57a27ce39fa4bfa4dfdfec510aa97e17c1a62082bcfabd8fcf", size = 3713382, upload-time = 
"2026-06-11T19:25:54.405Z" }, + { url = "https://files.pythonhosted.org/packages/b6/89/877059911db5e119d9cb0237ace2ef5ce70452877d6ac684f25b827ea007/geoarrow_rust_core-0.6.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:720b80031efc2a356727b179d0676c7b34539459806e5fd680121364eb226b91", size = 4201145, upload-time = "2026-06-11T19:25:55.783Z" }, + { url = "https://files.pythonhosted.org/packages/50/e2/9540486e62e1aab7737c103b08caf87fe3337d627a1e1c278a476839495a/geoarrow_rust_core-0.6.3-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:34543b11f9a383256fc5a98745d298a451bb6a0325a0d28156c2ede95a205ac6", size = 4271238, upload-time = "2026-06-11T19:25:57.333Z" }, + { url = "https://files.pythonhosted.org/packages/a2/0a/58b80698ac176834c718c383f9650fdefe517586fc044973e9243da6980f/geoarrow_rust_core-0.6.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fdcad76f4ee70e6c0c3fa25efc106af0410fd33aea9e3291c0c54d7adee19a5d", size = 5607074, upload-time = "2026-06-11T19:25:58.868Z" }, + { url = "https://files.pythonhosted.org/packages/45/cb/7c9af6e1dc21ab73fcb9ef08f6ff8d081bd9aa05d6011743adff51e15cf1/geoarrow_rust_core-0.6.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:181b0d39f1ac2974abd697b5bcd75a4cb5b787b94746d6436311a79e5b1c94ea", size = 4416546, upload-time = "2026-06-11T19:26:00.596Z" }, + { url = "https://files.pythonhosted.org/packages/00/4c/a80882daf7fafa515f103e2a9504f2f86c3f044efe195ab7f2b870ee95d6/geoarrow_rust_core-0.6.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:060a70669ed6363aa267c09c386af9eb641435832ea6abb64191826dcc4bb162", size = 4253545, upload-time = "2026-06-11T19:26:02.172Z" }, + { url = "https://files.pythonhosted.org/packages/12/14/2c63cffe79b5988e91ef90d1fd149270496de9ec4f5106a887d254787f4f/geoarrow_rust_core-0.6.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:e569b79e49c52898c6f7fc134d9704d9b24c0609fc9ce5ed9ed03eb80f510862", size = 4749440, upload-time = "2026-06-11T19:26:03.708Z" }, + { url = "https://files.pythonhosted.org/packages/6c/53/3d0c8f94fcd09e44af707716e10a64e30c855acaa38093468fb9518f1f83/geoarrow_rust_core-0.6.3-cp39-cp39-win_amd64.whl", hash = "sha256:0457ce64df184727fdcad581ddb32004947588dd1495ea1133cf786f958a1197", size = 3604831, upload-time = "2026-06-11T19:26:05.088Z" }, ] [[package]] From 838f78b988142fa0a4e5498b6c456ae114204bcb Mon Sep 17 00:00:00 2001 From: FANNG Date: Thu, 18 Jun 2026 10:09:57 +0800 Subject: [PATCH 139/177] fix: avoid versioned describe table for namespace opens (#7250) --- .../java/org/lance/OpenDatasetBuilder.java | 4 +- .../namespace/DirectoryNamespaceTest.java | 48 +++++++++++++++++++ python/python/lance/__init__.py | 4 +- python/python/tests/test_namespace_dir.py | 43 +++++++++++++++++ 4 files changed, 96 insertions(+), 3 deletions(-) diff --git a/java/src/main/java/org/lance/OpenDatasetBuilder.java b/java/src/main/java/org/lance/OpenDatasetBuilder.java index baece0767a1..32fd5ca7635 100644 --- a/java/src/main/java/org/lance/OpenDatasetBuilder.java +++ b/java/src/main/java/org/lance/OpenDatasetBuilder.java @@ -216,8 +216,8 @@ private Dataset buildFromNamespaceClient() { // Call describe_table to get location and storage options DescribeTableRequest request = new DescribeTableRequest(); request.setId(tableId); - // Only set version if present - options.getVersion().ifPresent(v -> request.setVersion(Long.valueOf(v))); + // Do not set the dataset version here. Some namespace implementations only support describing + // the latest table metadata; the requested version is applied when opening the dataset below. 
DescribeTableResponse response = namespaceClient.describeTable(request); diff --git a/java/src/test/java/org/lance/namespace/DirectoryNamespaceTest.java b/java/src/test/java/org/lance/namespace/DirectoryNamespaceTest.java index f425ddcc4f9..c622bac9fcd 100644 --- a/java/src/test/java/org/lance/namespace/DirectoryNamespaceTest.java +++ b/java/src/test/java/org/lance/namespace/DirectoryNamespaceTest.java @@ -189,6 +189,33 @@ void testNamespaceId() { "namespaceId should contain 'DirectoryNamespace', got: " + namespaceId); } + @Test + void testOpenSpecificVersionDoesNotPassVersionToDescribeTable() throws Exception { + VersionRejectingNamespace versionRejectingNamespace = + new VersionRejectingNamespace(innerNamespaceClient); + namespaceClient = versionRejectingNamespace; + List tableId = Arrays.asList("test_table"); + + namespaceClient.createTable(new CreateTableRequest().id(tableId), createTestTableData()); + namespaceClient.insertIntoTable( + new InsertIntoTableRequest().id(tableId).mode("append"), createTestTableData()); + + try (Dataset versionOne = + Dataset.open() + .allocator(allocator) + .namespaceClient(namespaceClient) + .tableId(tableId) + .readOptions(new ReadOptions.Builder().setVersion(1L).build()) + .build()) { + assertEquals(1, versionOne.version()); + assertEquals(3, versionOne.countRows()); + } + + assertTrue( + versionRejectingNamespace.getDescribeTableCallCount() > 0, + "Expected describeTable to be called when opening through namespace"); + } + @Test void testCreateAndListNamespaces() { // Create a namespace @@ -1439,4 +1466,25 @@ private byte[] createVectorTableData(int numRows, int dim) throws Exception { return out.toByteArray(); } } + + private static class VersionRejectingNamespace extends CustomNamespace { + private final AtomicInteger describeTableCallCount = new AtomicInteger(); + + VersionRejectingNamespace(DirectoryNamespace inner) { + super(inner); + } + + @Override + public DescribeTableResponse describeTable(DescribeTableRequest 
request) { + describeTableCallCount.incrementAndGet(); + assertNull( + request.getVersion(), + "Dataset version should be passed to dataset open, not describeTable"); + return super.describeTable(request); + } + + int getDescribeTableCallCount() { + return describeTableCallCount.get(); + } + } } diff --git a/python/python/lance/__init__.py b/python/python/lance/__init__.py index f58b169a47a..be99eb05cc5 100644 --- a/python/python/lance/__init__.py +++ b/python/python/lance/__init__.py @@ -230,7 +230,9 @@ def dataset( "Both 'namespace_client' and 'table_id' must be provided together." ) - request = DescribeTableRequest(id=table_id, version=version) + # Resolve the latest table metadata here. The requested dataset version is + # applied by the lower-level dataset open path after namespace resolution. + request = DescribeTableRequest(id=table_id, version=None) response = namespace_client.describe_table(request) uri = response.location diff --git a/python/python/tests/test_namespace_dir.py b/python/python/tests/test_namespace_dir.py index a57879d3368..fa1bc93b422 100644 --- a/python/python/tests/test_namespace_dir.py +++ b/python/python/tests/test_namespace_dir.py @@ -1106,6 +1106,49 @@ def test_external_manifest_store_invokes_namespace_apis(use_custom): ), "describe_table_version should be called once when opening version 1" +def test_dataset_namespace_open_does_not_pass_version_to_describe_table(): + """Dataset versions are applied to dataset open, not namespace describe_table.""" + + class VersionRejectingNamespace(CustomNamespace): + def __init__(self, inner: lance.namespace.DirectoryNamespace): + super().__init__(inner) + self.describe_versions = [] + + def describe_table( + self, request: DescribeTableRequest + ) -> DescribeTableResponse: + self.describe_versions.append(request.version) + assert request.version is None + return super().describe_table(request) + + with tempfile.TemporaryDirectory() as tmpdir: + inner_ns_client = 
lance.namespace.DirectoryNamespace(root=tmpdir) + ns_client = VersionRejectingNamespace(inner_ns_client) + table_id = ["test_table"] + + table1 = pa.Table.from_pylist([{"a": 1}, {"a": 2}]) + ds = lance.write_dataset( + table1, namespace_client=ns_client, table_id=table_id, mode="create" + ) + assert ds.count_rows() == 2 + assert ds.version == 1 + + table2 = pa.Table.from_pylist([{"a": 3}]) + ds = lance.write_dataset( + table2, namespace_client=ns_client, table_id=table_id, mode="append" + ) + assert ds.count_rows() == 3 + assert ds.version == 2 + + version_one = lance.dataset( + namespace_client=ns_client, table_id=table_id, version=1 + ) + assert version_one.count_rows() == 2 + assert version_one.version == 1 + assert ns_client.describe_versions + assert all(version is None for version in ns_client.describe_versions) + + @pytest.mark.skipif( sys.platform == "win32", reason="Windows file locking prevents reliable concurrent filesystem operations", From c4e65645d75b17c572fee62809b013b3730370bb Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Thu, 18 Jun 2026 15:47:57 +0800 Subject: [PATCH 140/177] feat: support mixed-language FTS stop words (#7324) This PR extends FTS stop-word handling so ICU tokenization can remove built-in stop words across supported languages, while non-ICU tokenizers continue to use the configured language and existing custom stop-word override behavior. It also fills the missing built-in stop-word lists for languages already exposed by `Language`, without changing the public API or index protobuf format. 
--- Cargo.lock | 10 +++ python/Cargo.lock | 10 +++ python/python/tests/test_scalar_index.py | 45 +++++++++++ .../src/scalar/inverted/tokenizer.rs | 72 ++++++++++++++--- rust/lance-tokenizer/Cargo.toml | 1 + rust/lance-tokenizer/src/stop_word_filter.rs | 80 +++++++++++++++++-- .../src/stop_word_filter/stopwords.rs | 6 ++ 7 files changed, 208 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4e8646c2ed6..165548bb4a0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5094,6 +5094,7 @@ dependencies = [ "lindera", "rust-stemmers", "serde", + "stop-words", "unicode-normalization", ] @@ -8400,6 +8401,15 @@ version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e51f1e89f093f99e7432c491c382b88a6860a5adbe6bf02574bf0a08efff1978" +[[package]] +name = "stop-words" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d68df56303396bcfb639455b3c166804aeb7994005010aab5e9e8a1277b8871d" +dependencies = [ + "serde_json", +] + [[package]] name = "str_stack" version = "0.1.1" diff --git a/python/Cargo.lock b/python/Cargo.lock index 4570b3bc929..c3319f5cba6 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -4614,6 +4614,7 @@ dependencies = [ "lindera", "rust-stemmers", "serde", + "stop-words", "unicode-normalization", ] @@ -7531,6 +7532,15 @@ version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e51f1e89f093f99e7432c491c382b88a6860a5adbe6bf02574bf0a08efff1978" +[[package]] +name = "stop-words" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d68df56303396bcfb639455b3c166804aeb7994005010aab5e9e8a1277b8871d" +dependencies = [ + "serde_json", +] + [[package]] name = "strsim" version = "0.11.1" diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index 13b3de74838..b6e882633f5 100644 --- a/python/python/tests/test_scalar_index.py +++ 
b/python/python/tests/test_scalar_index.py @@ -871,6 +871,51 @@ def test_fts_custom_stop_words(tmp_path): assert len(results["_rowid"].to_pylist()) == 1 +def test_fts_stop_words_respect_language_for_simple_tokenizer(tmp_path): + data = pa.table({"text": ["the lance data", "的 lance data"]}) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + ds.create_scalar_index( + "text", + "INVERTED", + base_tokenizer="simple", + stem=False, + ) + + results = ds.to_table(full_text_query="the", with_row_id=True) + assert results.num_rows == 0 + + results = ds.to_table(full_text_query="的", with_row_id=True) + assert results["text"].to_pylist() == ["的 lance data"] + + +def test_fts_icu_stop_words_are_all_or_none(tmp_path): + data = pa.table({"text": ["the 的 lance data", "useful data"]}) + ds = lance.write_dataset(data, tmp_path / "enabled", mode="overwrite") + ds.create_scalar_index( + "text", + "INVERTED", + base_tokenizer="icu", + stem=False, + remove_stop_words=True, + ) + + assert ds.to_table(full_text_query="the", with_row_id=True).num_rows == 0 + assert ds.to_table(full_text_query="的", with_row_id=True).num_rows == 0 + assert ds.to_table(full_text_query="lance", with_row_id=True).num_rows == 1 + + ds = lance.write_dataset(data, tmp_path / "disabled", mode="overwrite") + ds.create_scalar_index( + "text", + "INVERTED", + base_tokenizer="icu", + stem=False, + remove_stop_words=False, + ) + + assert ds.to_table(full_text_query="the", with_row_id=True).num_rows == 1 + assert ds.to_table(full_text_query="的", with_row_id=True).num_rows == 1 + + def test_rowid_order(dataset): dataset.create_scalar_index("doc", index_type="INVERTED", with_position=False) results = dataset.scanner( diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs index 6024747025b..5a2a701dc73 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs @@ -355,16 +355,7 @@ impl 
InvertedIndexParams { builder = builder.filter_dynamic(Stemmer::new(self.language)); } if self.remove_stop_words { - let stop_word_filter = match &self.custom_stop_words { - Some(words) => StopWordFilter::remove(words.iter().cloned()), - None => StopWordFilter::new(self.language).ok_or_else(|| { - Error::invalid_input(format!( - "removing stop words for language {:?} is not supported yet", - self.language - )) - })?, - }; - builder = builder.filter_dynamic(stop_word_filter); + builder = builder.filter_dynamic(self.stop_word_filter()?); } if self.ascii_folding { builder = builder.filter_dynamic(AsciiFoldingFilter); @@ -382,6 +373,19 @@ impl InvertedIndexParams { } } + fn stop_word_filter(&self) -> Result { + match &self.custom_stop_words { + Some(words) => Ok(StopWordFilter::remove(words.iter().cloned())), + None if self.base_tokenizer == "icu" => Ok(StopWordFilter::all()), + None => StopWordFilter::new(self.language).ok_or_else(|| { + Error::invalid_input(format!( + "removing stop words for language {:?} is not supported yet", + self.language + )) + }), + } + } + fn build_base_tokenizer(&self) -> Result { match self.base_tokenizer.as_str() { "simple" => Ok(TextAnalyzer::builder(SimpleTokenizer::default()).dynamic()), @@ -503,4 +507,52 @@ mod tests { stream.process(&mut |token| tokens.push(token.text.clone())); assert_eq!(tokens, vec!["hello", "こんにちは", "世界"]); } + + #[test] + fn test_remove_stop_words_respects_language_for_non_icu_tokenizer() { + let mut tokenizer = InvertedIndexParams::default() + .stem(false) + .base_tokenizer("simple".to_string()) + .build() + .unwrap(); + let mut stream = tokenizer.token_stream_for_search("the 的 lance data"); + let mut tokens = Vec::new(); + while let Some(token) = stream.next() { + tokens.push(token.text.clone()); + } + assert_eq!( + tokens, + vec!["的".to_string(), "lance".to_string(), "data".to_string()] + ); + } + + #[test] + fn test_custom_stop_words_replace_language_builtins() { + let mut tokenizer = 
InvertedIndexParams::default() + .stem(false) + .custom_stop_words(Some(vec!["lance".to_string()])) + .build() + .unwrap(); + let mut stream = tokenizer.token_stream_for_search("the lance data"); + let mut tokens = Vec::new(); + while let Some(token) = stream.next() { + tokens.push(token.text.clone()); + } + assert_eq!(tokens, vec!["the".to_string(), "data".to_string()]); + } + + #[test] + fn test_icu_stop_words_use_all_builtin_lists() { + let mut tokenizer = InvertedIndexParams::default() + .stem(false) + .base_tokenizer("icu".to_string()) + .build() + .unwrap(); + let mut stream = tokenizer.token_stream_for_search("the 的 lance data"); + let mut tokens = Vec::new(); + while let Some(token) = stream.next() { + tokens.push(token.text.clone()); + } + assert_eq!(tokens, vec!["lance".to_string(), "data".to_string()]); + } } diff --git a/rust/lance-tokenizer/Cargo.toml b/rust/lance-tokenizer/Cargo.toml index 5edfe4a9f16..e1006cd93c7 100644 --- a/rust/lance-tokenizer/Cargo.toml +++ b/rust/lance-tokenizer/Cargo.toml @@ -17,6 +17,7 @@ jieba-rs = { workspace = true, optional = true } lindera = { workspace = true, optional = true } rust-stemmers = "1.2.0" serde = { workspace = true, features = ["derive"] } +stop-words = { version = "0.10.0", default-features = false, features = ["iso", "nltk"] } unicode-normalization = "0.1.25" [features] diff --git a/rust/lance-tokenizer/src/stop_word_filter.rs b/rust/lance-tokenizer/src/stop_word_filter.rs index 0c49330a619..2acf0b3dbd5 100644 --- a/rust/lance-tokenizer/src/stop_word_filter.rs +++ b/rust/lance-tokenizer/src/stop_word_filter.rs @@ -12,6 +12,34 @@ use std::sync::Arc; use crate::{Language, Token, TokenFilter, TokenStream, Tokenizer}; +fn all_stop_words() -> impl Iterator { + [ + stop_words::get("ar"), + stopwords::DANISH, + stopwords::DUTCH, + stopwords::ENGLISH, + stopwords::FINNISH, + stopwords::FRENCH, + stopwords::GERMAN, + stop_words::get("el"), + stopwords::HUNGARIAN, + stopwords::ITALIAN, + stopwords::NORWEGIAN, + 
stopwords::PORTUGUESE, + stop_words::get("ro"), + stopwords::RUSSIAN, + stopwords::SPANISH, + stopwords::SWEDISH, + stop_words::get("ta"), + stop_words::get("tr"), + stop_words::get("zh"), + stop_words::get("ja"), + stop_words::get("ko"), + ] + .into_iter() + .flat_map(|words| words.iter().copied()) +} + #[derive(Clone)] pub struct StopWordFilter { words: Arc>, @@ -20,28 +48,32 @@ pub struct StopWordFilter { impl StopWordFilter { pub fn new(language: Language) -> Option { let words = match language { + Language::Arabic => stop_words::get("ar"), Language::Danish => stopwords::DANISH, Language::Dutch => stopwords::DUTCH, - Language::English => &[ - "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", - "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", - "there", "these", "they", "this", "to", "was", "will", "with", - ], + Language::English => stopwords::ENGLISH, Language::Finnish => stopwords::FINNISH, Language::French => stopwords::FRENCH, Language::German => stopwords::GERMAN, + Language::Greek => stop_words::get("el"), Language::Hungarian => stopwords::HUNGARIAN, Language::Italian => stopwords::ITALIAN, Language::Norwegian => stopwords::NORWEGIAN, Language::Portuguese => stopwords::PORTUGUESE, + Language::Romanian => stop_words::get("ro"), Language::Russian => stopwords::RUSSIAN, Language::Spanish => stopwords::SPANISH, Language::Swedish => stopwords::SWEDISH, - _ => return None, + Language::Tamil => stop_words::get("ta"), + Language::Turkish => stop_words::get("tr"), }; Some(Self::remove(words.iter().map(|word| (*word).to_owned()))) } + pub fn all() -> Self { + Self::remove(all_stop_words().map(str::to_owned)) + } + pub fn remove>(words: W) -> Self { Self { words: Arc::new(words.into_iter().collect()), @@ -49,6 +81,42 @@ impl StopWordFilter { } } +#[cfg(test)] +mod tests { + use super::all_stop_words; + use crate::StopWordFilter; + use std::collections::HashSet; + + #[test] + fn 
test_external_stop_word_lists_are_available() { + let words = all_stop_words().collect::>(); + for word in ["إلى", "και", "acesta", "அவர்", "ama", "的", "ある", "그리고"] + { + assert!( + words.contains(word), + "built-in stop words should contain {word}" + ); + } + } + + #[test] + fn test_language_stop_word_lists_are_available() { + for (language, word) in [ + (crate::Language::Arabic, "إلى"), + (crate::Language::Greek, "και"), + (crate::Language::Romanian, "acesta"), + (crate::Language::Tamil, "அவர்"), + (crate::Language::Turkish, "ama"), + ] { + let filter = StopWordFilter::new(language).unwrap(); + assert!( + filter.words.contains(word), + "{language:?} should contain {word}" + ); + } + } +} + impl TokenFilter for StopWordFilter { type Tokenizer = StopWordFilterWrapper; diff --git a/rust/lance-tokenizer/src/stop_word_filter/stopwords.rs b/rust/lance-tokenizer/src/stop_word_filter/stopwords.rs index 2ac3f4a28aa..227556ba527 100644 --- a/rust/lance-tokenizer/src/stop_word_filter/stopwords.rs +++ b/rust/lance-tokenizer/src/stop_word_filter/stopwords.rs @@ -37,6 +37,12 @@ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +pub const ENGLISH: &[&str] = &[ + "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", + "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", + "they", "this", "to", "was", "will", "with", +]; + pub const DANISH: &[&str] = &[ "og", "i", "jeg", "det", "at", "en", "den", "til", "er", "som", "på", "de", "med", "han", "af", "for", "ikke", "der", "var", "mig", "sig", "men", "et", "har", "om", "vi", "min", "havde", From 58861929e48ed11347732b095dc7d1640ea93c3a Mon Sep 17 00:00:00 2001 From: Dan Rammer Date: Thu, 18 Jun 2026 08:28:15 -0500 Subject: [PATCH 141/177] feat(mem-wal): snapshot-consistent as-of cut for fresh-tier membership (#7215) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What Adds `FreshTierWatermark { active_generation, active_batch_count }` and `LsmScanner::contains_pks_at` (with `fresh_tier_block_list` threading the watermark) so a caller can evaluate fresh-tier PK membership against the **exact tier a prior scan observed**, instead of the live tier. ## Why The WAL block-list runs as two independent RPCs (the read arm and a supersession check) that each snapshot the live fresh tier at their own call time. Under concurrent writes the two snapshots disagree, so a base row can be dropped as "superseded" by the check while the arm never delivered a replacement — a transient missing row. The fix pins both phases to the same watermark; this PR is the lance half that lets the check reconstruct the arm's snapshot. ## How the watermark works The active memtable is the only fresh-tier source that grows between two reads; everything strictly below its generation (frozen memtables, flushed generations) is immutable. 
So the as-of filter: - includes in-memory/flushed sources **below** `active_generation` whole (immutable, fully observed), - bounds the **active** generation to its first `active_batch_count` batches (by append index), - excludes in-memory sources **above** `active_generation` and flushed generations `>= active_generation` (produced after the snapshot). It uses only `batch_store.len()` and the memtable generation — both always available on the read path — and only ever *excludes* rows the scan did not observe, so a stale watermark under-counts (a tolerable stale read) rather than over-counts (which would drop a row with no replacement). > Note: an earlier approach keyed the watermark on per-batch WAL positions (`wal_batch_mapping`), but that map is only populated by `mark_wal_flushed`, which is test/bench-only — empty in production. The generation + batch-count watermark avoids any write-path dependency. ## Grace-period pinning A flushed memtable could otherwise be evicted between the two reads, collapsing its per-batch boundaries and turning the active-generation bound into a wholesale `>=` exclusion (a stale read). Frozen memtables are now retained in memory for a configurable grace period (`frozen_memtable_grace`, default 3s) after flush and swept by an existing dispatcher ticker. The grace must be **strictly larger than the maximum query elapsed time** to guarantee snapshot isolation: while pinned, the generation is served batch-resolved from memory; once evicted, no in-flight read references it so the `>=` exclusion is safe. ## Tests `fresh_tier_block_list` as-of unit tests: active-memtable batch-count bound, newer-gen-excluded / lower-gen-included-whole, flushed-gen at/above active excluded; collector suppression of a flushed generation pinned in memory; frozen retained during grace then swept. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.8 (1M context) --- .../benches/mem_wal/write/mem_wal_write.rs | 1 + .../src/dataset/mem_wal/memtable/flush.rs | 31 +- rust/lance/src/dataset/mem_wal/scanner.rs | 4 +- .../src/dataset/mem_wal/scanner/block_list.rs | 264 ++++++++++++++++-- .../src/dataset/mem_wal/scanner/builder.rs | 177 ++++++++---- .../src/dataset/mem_wal/scanner/collector.rs | 68 +++++ .../dataset/mem_wal/scanner/data_source.rs | 23 ++ rust/lance/src/dataset/mem_wal/write.rs | 220 ++++++++++++--- 8 files changed, 676 insertions(+), 112 deletions(-) diff --git a/rust/lance/benches/mem_wal/write/mem_wal_write.rs b/rust/lance/benches/mem_wal/write/mem_wal_write.rs index 24f3a0d7c8f..a92ec1f8847 100644 --- a/rust/lance/benches/mem_wal/write/mem_wal_write.rs +++ b/rust/lance/benches/mem_wal/write/mem_wal_write.rs @@ -649,6 +649,7 @@ fn bench_lance_memwal_write(c: &mut Criterion) { backpressure_log_interval: default_config .backpressure_log_interval, stats_log_interval: default_config.stats_log_interval, + frozen_memtable_grace: default_config.frozen_memtable_grace, enable_memtable, hnsw_params: default_config.hnsw_params, }; diff --git a/rust/lance/src/dataset/mem_wal/memtable/flush.rs b/rust/lance/src/dataset/mem_wal/memtable/flush.rs index be0a66d7d2c..bf462b6e226 100644 --- a/rust/lance/src/dataset/mem_wal/memtable/flush.rs +++ b/rust/lance/src/dataset/mem_wal/memtable/flush.rs @@ -1018,21 +1018,30 @@ impl MemTableFlusher { } } -/// Message to trigger flush of a frozen memtable to Lance storage. -pub struct TriggerMemTableFlush { - /// The frozen memtable to flush. - pub memtable: Arc, - /// Optional channel to notify when flush completes. - pub done: Option>>, +/// Message driving the background memtable-flush task. +pub enum TriggerMemTableFlush { + /// Flush a frozen memtable to Lance storage. + Flush { + /// The frozen memtable to flush. 
+ memtable: Arc, + /// Optional channel to notify when flush completes. + done: Option>>, + }, + /// Periodic tick: evict frozen memtables whose post-flush grace has elapsed. + SweepExpired, } impl std::fmt::Debug for TriggerMemTableFlush { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("TriggerMemTableFlush") - .field("memtable_gen", &self.memtable.generation()) - .field("memtable_rows", &self.memtable.row_count()) - .field("has_done", &self.done.is_some()) - .finish() + match self { + Self::Flush { memtable, done } => f + .debug_struct("TriggerMemTableFlush::Flush") + .field("memtable_gen", &memtable.generation()) + .field("memtable_rows", &memtable.row_count()) + .field("has_done", &done.is_some()) + .finish(), + Self::SweepExpired => f.write_str("TriggerMemTableFlush::SweepExpired"), + } } } diff --git a/rust/lance/src/dataset/mem_wal/scanner.rs b/rust/lance/src/dataset/mem_wal/scanner.rs index f6942681223..aaf915fb81f 100644 --- a/rust/lance/src/dataset/mem_wal/scanner.rs +++ b/rust/lance/src/dataset/mem_wal/scanner.rs @@ -48,7 +48,9 @@ pub use builder::LsmScanner; pub use collector::{ ActiveMemTableRef, InMemoryMemTableRef, InMemoryMemTables, LsmDataSourceCollector, }; -pub use data_source::{FlushedGeneration, LsmDataSource, LsmGeneration, ShardSnapshot}; +pub use data_source::{ + FlushedGeneration, FreshTierWatermark, LsmDataSource, LsmGeneration, ShardSnapshot, +}; pub use flushed_cache::FlushedMemTableCache; pub use fts_search::{LsmFtsSearchPlanner, SCORE_COLUMN}; pub use point_lookup::LsmPointLookupPlanner; diff --git a/rust/lance/src/dataset/mem_wal/scanner/block_list.rs b/rust/lance/src/dataset/mem_wal/scanner/block_list.rs index fe197772492..f7f957845e4 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/block_list.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/block_list.rs @@ -31,7 +31,7 @@ use lance_index::scalar::{ }; use uuid::Uuid; -use super::data_source::{LsmDataSource, LsmGeneration}; +use 
super::data_source::{FreshTierWatermark, LsmDataSource, LsmGeneration}; use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset}; use crate::dataset::mem_wal::index::encode_pk_tuple; use crate::dataset::mem_wal::util::PK_INDEX_DIR; @@ -163,6 +163,9 @@ pub async fn compute_source_block_lists( // per-shard, so supersession is within-shard only). let mut by_shard: ShardGenSets = HashMap::new(); let mut has_base = false; + // Flushed PK-BTree opens are cold S3 reads; overlap them with + // `try_join_all`. Order is irrelevant — gens are sorted per-shard below. + let mut flushed_loads = Vec::new(); for source in sources { match source { LsmDataSource::BaseTable { .. } => has_base = true, @@ -184,15 +187,18 @@ pub async fn compute_source_block_lists( shard_id, generation, .. - } => { + } => flushed_loads.push(async move { let index = open_pk_index(path, session, flushed_cache).await?; - by_shard - .entry(*shard_id) - .or_default() - .push((*generation, GenMembership::OnDisk(index))); - } + Ok::<_, Error>((*shard_id, *generation, GenMembership::OnDisk(index))) + }), } } + for (shard_id, generation, membership) in futures::future::try_join_all(flushed_loads).await? { + by_shard + .entry(shard_id) + .or_default() + .push((generation, membership)); + } let mut blocked: SourceBlockLists = HashMap::new(); // Base (shardless, oldest) is superseded by every non-base generation. @@ -223,29 +229,91 @@ pub async fn compute_source_block_lists( /// reader can test any PK against these (via [`GenMembership::contains`]) to /// decide whether the fresh tier shadows it. The base source, if present, is /// skipped (it is what gets shadowed). +/// +/// When `watermarks` carries a watermark for a source's shard, membership is +/// bounded to it (see [`FreshTierWatermark`]): higher generations are excluded, +/// the active generation is bounded to its first `active_batch_count` batches, +/// and lower generations (frozen and flushed) are immutable and included whole. 
+/// A shard absent from `watermarks` (or `watermarks == None`) uses the live tier. pub async fn fresh_tier_block_list( sources: &[LsmDataSource], session: Option<&Arc>, flushed_cache: Option<&Arc>, + watermarks: Option<&HashMap>, ) -> Result> { - let mut memberships = Vec::new(); + // Membership per source, in source order (`None` = skipped). Flushed + // PK-BTree opens are cold S3 reads, so collect them tagged with their slot + // and overlap with `try_join_all` rather than opening one at a time. + let mut slots: Vec> = Vec::with_capacity(sources.len()); + let mut flushed_loads = Vec::new(); for source in sources { - let membership = match source { - LsmDataSource::BaseTable { .. } => continue, + match source { + LsmDataSource::BaseTable { .. } => slots.push(None), LsmDataSource::ActiveMemTable { batch_store, index_store, + shard_id, + generation, .. - } => in_memory_membership(batch_store, index_store), - LsmDataSource::FlushedMemTable { path, .. } => { - GenMembership::OnDisk(open_pk_index(path, session, flushed_cache).await?) + } => { + let membership = match watermarks.and_then(|m| m.get(shard_id)) { + None => Some(in_memory_membership(batch_store, index_store)), + Some(watermark) => { + let g = generation.as_u64(); + if g > watermark.active_generation { + // Rolled in after the snapshot; the arm never saw it. + None + } else if g == watermark.active_generation { + // Bound the active generation to the batches the arm saw. + Some(bounded_in_memory_membership( + batch_store, + index_store, + watermark.active_batch_count, + )) + } else { + // Lower (frozen) generations are immutable — include all. + Some(in_memory_membership(batch_store, index_store)) + } + } + }; + slots.push(membership); + } + LsmDataSource::FlushedMemTable { + path, + shard_id, + generation, + .. + } => { + // A generation at or above the active one was flushed after the + // snapshot; exclude it. Lower generations are immutable. 
The + // `==` case is the active generation flushed between the two + // reads: excluding the flushed copy loses nothing, since its + // rows are already captured by the in-memory arm above (bounded + // to `active_batch_count`). + let flushed_after_snapshot = watermarks + .and_then(|m| m.get(shard_id)) + .is_some_and(|watermark| generation.as_u64() >= watermark.active_generation); + if flushed_after_snapshot { + slots.push(None); + } else { + let slot = slots.len(); + slots.push(None); + flushed_loads.push(async move { + let index = open_pk_index(path, session, flushed_cache).await?; + Ok::<_, Error>((slot, GenMembership::OnDisk(index))) + }); + } } - }; - if !membership.is_empty() { - memberships.push(membership); } } - Ok(memberships) + for (slot, membership) in futures::future::try_join_all(flushed_loads).await? { + slots[slot] = Some(membership); + } + Ok(slots + .into_iter() + .flatten() + .filter(|membership| !membership.is_empty()) + .collect()) } /// Cross-source membership of an in-memory (active / frozen) memtable: a @@ -263,6 +331,26 @@ fn in_memory_membership( } } +/// As-of variant of [`in_memory_membership`] for the active generation under a +/// watermark: bounds visibility to the first `batch_count` batches — those a +/// prior scan observed before the memtable grew. A later append lands at a +/// higher row position and is excluded by the probe, so it can't shadow a base +/// row whose replacement the scan never delivered. `batch_count == 0` leaves the +/// membership empty. +fn bounded_in_memory_membership( + batch_store: &Arc, + index_store: &Arc, + batch_count: u64, +) -> GenMembership { + let max_visible_row = batch_count + .checked_sub(1) + .and_then(|last_batch| batch_store.max_visible_row(last_batch as usize)); + GenMembership::InMemory { + index_store: index_store.clone(), + max_visible_row, + } +} + /// Open the standalone PK BTree at `{flushed gen}/_pk_index` for one flushed /// generation. 
Reuses the flushed dataset's (session-configured) object store /// and **its index cache**, then loads the sidecar directly by path through the @@ -446,7 +534,9 @@ mod tests { active_source(shard, 1, &[3]), ]; - let memberships = fresh_tier_block_list(&sources, None, None).await.unwrap(); + let memberships = fresh_tier_block_list(&sources, None, None, None) + .await + .unwrap(); // One membership per generation; together they cover pk=1,2,3 (not 4). assert_eq!(memberships.len(), 2); @@ -593,4 +683,142 @@ mod tests { "a not-yet-visible newer write must not shadow an older visible copy" ); } + + /// A fresh-tier watermark bounds the active generation to the first + /// `active_batch_count` batches — those the arm observed before the memtable + /// grew. A later append is invisible, so a base row is never dropped without + /// the arm having delivered its replacement. + #[tokio::test] + async fn fresh_tier_watermark_bounds_active_memtable_by_batch_count() { + use crate::dataset::mem_wal::scanner::data_source::FreshTierWatermark; + use std::collections::HashMap; + + let shard = Uuid::new_v4(); + // Three single-row batches: pk=1 at batch 0, pk=2 at batch 1, pk=3 at + // batch 2 (appended after the arm). + let sources = vec![active_source(shard, 1, &[1, 2, 3])]; + + // Watermark at 2 batches of gen 1: pk=1,2 are members; pk=3 (batch 2) is not. + let watermarks: HashMap = [( + shard, + FreshTierWatermark { + active_generation: 1, + active_batch_count: 2, + }, + )] + .into_iter() + .collect(); + let sets = fresh_tier_block_list(&sources, None, None, Some(&watermarks)) + .await + .unwrap(); + assert!(blocks(&sets, 1).await); + assert!(blocks(&sets, 2).await); + assert!(!blocks(&sets, 3).await); + + // No watermark → live tier: all three are members. 
+ let sets = fresh_tier_block_list(&sources, None, None, None) + .await + .unwrap(); + for id in [1, 2, 3] { + assert!(blocks(&sets, id).await); + } + } + + /// A generation above the active one rolled in after the snapshot and is + /// excluded whole; a lower one is immutable (frozen) and included whole + /// regardless of the active batch count. + #[tokio::test] + async fn fresh_tier_watermark_excludes_newer_gen_includes_lower_gen() { + use crate::dataset::mem_wal::scanner::data_source::FreshTierWatermark; + use std::collections::HashMap; + + let shard = Uuid::new_v4(); + // gen 3 newer (after snapshot), gen 2 == active (bounded to 1 batch), + // gen 1 lower/immutable (whole). Each id is its own batch. + let sources = vec![ + active_source(shard, 3, &[100]), + active_source(shard, 2, &[20, 21]), + active_source(shard, 1, &[1, 2]), + ]; + + let watermarks: HashMap = [( + shard, + FreshTierWatermark { + active_generation: 2, + active_batch_count: 1, + }, + )] + .into_iter() + .collect(); + let sets = fresh_tier_block_list(&sources, None, None, Some(&watermarks)) + .await + .unwrap(); + assert!(blocks(&sets, 1).await); // gen 1, whole + assert!(blocks(&sets, 2).await); // gen 1, whole + assert!(blocks(&sets, 20).await); // gen 2, batch 0 + assert!(!blocks(&sets, 21).await); // gen 2, batch 1 — past the watermark + assert!(!blocks(&sets, 100).await); // gen 3 — after the snapshot + } + + /// A flushed generation at or above the active generation was produced by a + /// flush after the snapshot and is excluded; one strictly below it is + /// immutable and included. 
+ #[tokio::test] + async fn fresh_tier_watermark_excludes_flushed_at_or_above_active() { + use crate::dataset::mem_wal::scanner::data_source::FreshTierWatermark; + use crate::dataset::{Dataset, WriteParams}; + use arrow_array::RecordBatchIterator; + use std::collections::HashMap; + + // A flushed generation 2 holding pk=5, staged as a flushed dataset with + // its standalone PK sidecar (what the on-disk membership probes). + let flushed_batch = id_batch(&[5]); + let schema = flushed_batch.schema(); + let tmp = tempfile::tempdir().unwrap(); + let path = format!("{}/gen2", tmp.path().to_str().unwrap()); + let reader = RecordBatchIterator::new(vec![Ok(flushed_batch.clone())], schema.clone()); + Dataset::write(reader, &path, Some(WriteParams::default())) + .await + .unwrap(); + write_pk_sidecar(&path, &[flushed_batch], &["id"]) + .await + .unwrap(); + + let shard = Uuid::new_v4(); + let sources = vec![LsmDataSource::FlushedMemTable { + path, + shard_id: shard, + generation: LsmGeneration::memtable(2), + }]; + + // active_generation 2 (gen 2 flushed at/after the snapshot): excluded. + let at: HashMap = [( + shard, + FreshTierWatermark { + active_generation: 2, + active_batch_count: u64::MAX, + }, + )] + .into_iter() + .collect(); + let sets = fresh_tier_block_list(&sources, None, None, Some(&at)) + .await + .unwrap(); + assert!(!blocks(&sets, 5).await); + + // active_generation 3 (gen 2 strictly below, immutable): included. 
+ let above: HashMap = [( + shard, + FreshTierWatermark { + active_generation: 3, + active_batch_count: u64::MAX, + }, + )] + .into_iter() + .collect(); + let sets = fresh_tier_block_list(&sources, None, None, Some(&above)) + .await + .unwrap(); + assert!(blocks(&sets, 5).await); + } } diff --git a/rust/lance/src/dataset/mem_wal/scanner/builder.rs b/rust/lance/src/dataset/mem_wal/scanner/builder.rs index 1ab0950baf8..e4940ebf706 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/builder.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/builder.rs @@ -20,7 +20,7 @@ use lance_core::{Error, Result, is_system_column}; use uuid::Uuid; use super::collector::{InMemoryMemTableRef, InMemoryMemTables, LsmDataSourceCollector}; -use super::data_source::ShardSnapshot; +use super::data_source::{FreshTierWatermark, ShardSnapshot}; use super::flushed_cache::FlushedMemTableCache; use super::planner::LsmScanPlanner; use super::point_lookup::LsmPointLookupPlanner; @@ -475,33 +475,61 @@ impl LsmScanner { /// hashes PKs itself. Flushed membership comes from the injected /// [`FlushedMemTableCache`] when one is set. pub async fn contains_pks(&self, pks: &RecordBatch) -> Result> { + self.contains_pks_at(pks, None).await + } + + /// As-of variant of [`Self::contains_pks`]. Membership is evaluated against + /// a per-shard watermark on the fresh tier, supplied via `watermarks` (see + /// [`FreshTierWatermark`]), matching the tier a prior scan observed and + /// avoiding the two-snapshot skew that would drop a base row with no + /// delivered replacement. `None` evaluates against the live tier. 
+ pub async fn contains_pks_at( + &self, + pks: &RecordBatch, + watermarks: Option<&HashMap>, + ) -> Result> { let sources = self.build_collector().collect()?; let memberships = super::block_list::fresh_tier_block_list( &sources, self.session.as_ref(), self.flushed_cache.as_ref(), + watermarks, ) .await?; let pk_indices = super::exec::resolve_pk_indices(pks, &self.pk_columns) .map_err(|e| Error::invalid_input(e.to_string()))?; - let mut contained = Vec::with_capacity(pks.num_rows()); - for row in 0..pks.num_rows() { - // Both in-memory and flushed generations probe by the same key (the - // typed value, or the encoded `Binary` tuple for a composite PK). - let values: Vec = pk_indices - .iter() - .map(|&col| ScalarValue::try_from_array(pks.column(col), row)) - .collect::>() - .map_err(|e| Error::invalid_input(e.to_string()))?; - let key = super::block_list::on_disk_pk_key(&values)?; - let mut found = false; - for membership in &memberships { - if membership.contains(&key).await? { - found = true; - break; + // One key per row, in the index key space (typed value, or encoded + // `Binary` tuple for a composite PK). + let keys: Vec = (0..pks.num_rows()) + .map(|row| { + let values: Vec = pk_indices + .iter() + .map(|&col| ScalarValue::try_from_array(pks.column(col), row)) + .collect::>() + .map_err(|e| Error::invalid_input(e.to_string()))?; + super::block_list::on_disk_pk_key(&values) + }) + .collect::>()?; + + // A row is contained if any generation contains its key. Probe each + // generation once (batched), narrowing to still-unfound rows. 
+ let mut contained = vec![false; keys.len()]; + let mut live: Vec = (0..keys.len()).collect(); + for membership in &memberships { + if live.is_empty() { + break; + } + let live_keys: Vec = live.iter().map(|&i| keys[i].clone()).collect(); + let mask = membership.contains_keys(&live_keys).await?; + let mut next_live = Vec::with_capacity(live.len()); + for (pos, &row) in live.iter().enumerate() { + if mask[pos] { + contained[row] = true; + } else { + next_live.push(row); } } - contained.push(found); + live = next_live; } Ok(contained) } @@ -604,39 +632,42 @@ mod tests { assert_eq!(memtable_ref.generation, 10); } - #[tokio::test] - async fn contains_pks_reports_fresh_tier_membership() { - use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; - use arrow_array::Int32Array; + /// Single-column `id: Int32` schema used by the PK-membership tests. + fn pk_schema() -> SchemaRef { use arrow_schema::{DataType, Field, Schema}; + Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])) + } - let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); - let id_batch = |ids: &[i32]| { - RecordBatch::try_new( - schema.clone(), - vec![Arc::new(Int32Array::from(ids.to_vec()))], - ) - .unwrap() - }; - let mk = |ids: &[i32], generation: u64| { - let store = BatchStore::with_capacity(8); - let mut index = IndexStore::new(); - index.enable_pk_index(&[("id".to_string(), 0)]); - let b = id_batch(ids); - let (bp, off, _) = store.append(b.clone()).unwrap(); - index.insert_with_batch_position(&b, off, Some(bp)).unwrap(); - InMemoryMemTableRef { - batch_store: Arc::new(store), - index_store: Arc::new(index), - schema: schema.clone(), - generation, - } - }; + /// A `RecordBatch` of `id` values against [`pk_schema`]. 
+ fn id_pk_batch(ids: &[i32]) -> RecordBatch { + use arrow_array::Int32Array; + RecordBatch::try_new(pk_schema(), vec![Arc::new(Int32Array::from(ids.to_vec()))]).unwrap() + } + /// An active/frozen memtable holding `ids` at `generation`, with a single + /// batch and a maintained primary-key index on `id`. + fn mk_pk_memtable(ids: &[i32], generation: u64) -> InMemoryMemTableRef { + use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; + let store = BatchStore::with_capacity(8); + let mut index = IndexStore::new(); + index.enable_pk_index(&[("id".to_string(), 0)]); + let b = id_pk_batch(ids); + let (bp, off, _) = store.append(b.clone()).unwrap(); + index.insert_with_batch_position(&b, off, Some(bp)).unwrap(); + InMemoryMemTableRef { + batch_store: Arc::new(store), + index_store: Arc::new(index), + schema: pk_schema(), + generation, + } + } + + #[tokio::test] + async fn contains_pks_reports_fresh_tier_membership() { // Fresh-tier only: active gen 2 (pk=1,2) + frozen gen 1 (pk=3). let shard = Uuid::new_v4(); let scanner = LsmScanner::without_base_table( - schema.clone(), + pk_schema(), "memory://t", vec![], vec!["id".to_string()], @@ -644,16 +675,68 @@ mod tests { .with_in_memory_memtables( shard, InMemoryMemTables { - active: mk(&[1, 2], 2), - frozen: vec![mk(&[3], 1)], + active: mk_pk_memtable(&[1, 2], 2), + frozen: vec![mk_pk_memtable(&[3], 1)], }, ); // pk=1 (active), pk=4 (absent), pk=3 (frozen). - let result = scanner.contains_pks(&id_batch(&[1, 4, 3])).await.unwrap(); + let result = scanner + .contains_pks(&id_pk_batch(&[1, 4, 3])) + .await + .unwrap(); assert_eq!(result, vec![true, false, true]); } + /// `contains_pks_at` probes each generation once over the still-unfound + /// rows, so a multi-PK batch spanning several generations resolves to the + /// right per-row mask — and a watermark bounds which generations count. 
+ #[tokio::test] + async fn contains_pks_at_batched_probe_respects_watermark() { + use crate::dataset::mem_wal::scanner::data_source::FreshTierWatermark; + + // active gen 2 (pk=1,2) + frozen gen 1 (pk=3,4). + let shard = Uuid::new_v4(); + let scanner = LsmScanner::without_base_table( + pk_schema(), + "memory://t", + vec![], + vec!["id".to_string()], + ) + .with_in_memory_memtables( + shard, + InMemoryMemTables { + active: mk_pk_memtable(&[1, 2], 2), + frozen: vec![mk_pk_memtable(&[3, 4], 1)], + }, + ); + + // Duplicate and out-of-order keys exercise the live-row narrowing: each + // generation only re-probes the rows earlier generations didn't claim. + let probe = id_pk_batch(&[4, 1, 9, 3, 2, 1]); + + // watermark=None → live tier: every PK present in either generation. + let live = scanner.contains_pks_at(&probe, None).await.unwrap(); + assert_eq!(live, vec![true, true, false, true, true, true]); + + // watermark at gen 1 → active gen 2 rolled in after the snapshot and is + // excluded; only the frozen gen 1 keys (3,4) remain members. + let watermarks: HashMap = [( + shard, + FreshTierWatermark { + active_generation: 1, + active_batch_count: u64::MAX, + }, + )] + .into_iter() + .collect(); + let bounded = scanner + .contains_pks_at(&probe, Some(&watermarks)) + .await + .unwrap(); + assert_eq!(bounded, vec![true, false, false, true, false, false]); + } + /// One active memtable with a maintained BTree on `id`, all rows visible. 
fn mk_indexed_memtable(schema: &SchemaRef, ids: &[i32], names: &[&str]) -> InMemoryMemTableRef { use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; diff --git a/rust/lance/src/dataset/mem_wal/scanner/collector.rs b/rust/lance/src/dataset/mem_wal/scanner/collector.rs index 2db4b4f277d..6645f159b12 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/collector.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/collector.rs @@ -229,6 +229,19 @@ impl LsmDataSourceCollector { .collect() } + /// True when `generation` for `shard_id` is still pinned in memory as a + /// frozen memtable. During the post-flush grace window a generation is both + /// committed to the manifest (a flushed source) and held in memory (an + /// in-memory source); it must be served only from memory — which preserves + /// the per-batch boundaries the flushed dataset has lost, so as-of reads + /// stay snapshot-bounded — and its on-disk copy skipped to avoid scanning + /// the generation twice. See `ShardWriterConfig::frozen_memtable_grace`. + fn flushed_gen_pinned_in_memory(&self, shard_id: &Uuid, generation: u64) -> bool { + self.in_memory_memtables + .get(shard_id) + .is_some_and(|mems| mems.frozen.iter().any(|f| f.generation == generation)) + } + /// Collect all data sources. 
/// /// Returns sources in a consistent order: @@ -246,6 +259,9 @@ impl LsmDataSourceCollector { for snapshot in &self.shard_snapshots { for flushed in &snapshot.flushed_generations { + if self.flushed_gen_pinned_in_memory(&snapshot.shard_id, flushed.generation) { + continue; + } let path = self.resolve_flushed_path(&snapshot.shard_id, &flushed.path); sources.push(LsmDataSource::FlushedMemTable { path, @@ -284,6 +300,9 @@ impl LsmDataSourceCollector { } for flushed in &snapshot.flushed_generations { + if self.flushed_gen_pinned_in_memory(&snapshot.shard_id, flushed.generation) { + continue; + } let path = self.resolve_flushed_path(&snapshot.shard_id, &flushed.path); sources.push(LsmDataSource::FlushedMemTable { path, @@ -443,4 +462,53 @@ mod tests { 3 ); } + + /// During the post-flush grace window a generation is both committed to the + /// manifest (a flushed source) and still pinned in memory (a frozen + /// source). The collector must emit it once, from memory — so as-of reads + /// keep batch-resolved membership — and skip the on-disk copy. Flushed + /// generations NOT pinned in memory are still emitted from disk. + #[test] + fn test_collect_suppresses_flushed_gen_pinned_in_memory() { + let shard = Uuid::new_v4(); + // Manifest lists gens 1 and 2 as flushed; gen 2 is still pinned in + // memory (just flushed, within grace), gen 1 has been swept. + let snapshot = ShardSnapshot { + shard_id: shard, + spec_id: 0, + current_generation: 3, + flushed_generations: vec![ + FlushedGeneration { + generation: 1, + path: "gen_1".to_string(), + }, + FlushedGeneration { + generation: 2, + path: "gen_2".to_string(), + }, + ], + }; + let mems = InMemoryMemTables { + active: memtable_ref(3), + frozen: vec![memtable_ref(2)], + }; + let collector = LsmDataSourceCollector::without_base_table("/tmp/x", vec![snapshot]) + .with_in_memory_memtables(shard, mems); + + let sources = collector.collect().unwrap(); + // gen 1: on-disk (not pinned). 
gen 2: in-memory only (pinned, disk + // copy suppressed). gen 3: active. No duplicate gen 2. + let flushed: Vec = sources + .iter() + .filter(|s| !s.is_active_memtable()) + .map(|s| s.generation().as_u64()) + .collect(); + let in_memory: Vec = sources + .iter() + .filter(|s| s.is_active_memtable()) + .map(|s| s.generation().as_u64()) + .collect(); + assert_eq!(flushed, vec![1], "only the unpinned flushed gen from disk"); + assert_eq!(in_memory, vec![2, 3], "pinned gen 2 served from memory"); + } } diff --git a/rust/lance/src/dataset/mem_wal/scanner/data_source.rs b/rust/lance/src/dataset/mem_wal/scanner/data_source.rs index 1a6207f27e3..0d5f3fdc925 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/data_source.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/data_source.rs @@ -11,6 +11,29 @@ use uuid::Uuid; use crate::dataset::Dataset; use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; +/// A watermark marking how far into one shard's fresh tier a prior scan +/// observed, so membership can be evaluated as of that point (see +/// [`super::builder::LsmScanner::contains_pks_at`]). +/// +/// Only the active memtable grows between two reads (appended batches, and a new +/// generation when it rolls); everything at a lower generation — frozen and +/// flushed — is immutable and was fully observed. The watermark includes lower +/// generations whole, the active generation up to `active_batch_count` batches, +/// and excludes higher generations (which appeared after it). It uses only the +/// batch count and generation — both always available, unlike per-batch WAL +/// positions, which the write path does not track. The bound only excludes rows +/// the scan did not observe, so a stale watermark under-counts (a tolerable +/// stale read) rather than dropping a row with no replacement. +#[derive(Debug, Clone, Copy)] +pub struct FreshTierWatermark { + /// Active generation the scan observed. 
Higher generations are excluded; + /// lower ones are immutable and included whole. + pub active_generation: u64, + /// Active-memtable batch count at snapshot time. Within the active + /// generation, only batches at index `< active_batch_count` were observed. + pub active_batch_count: u64, +} + /// Generation number in LSM tree. /// /// The base table has generation 0. MemTables have positive integers diff --git a/rust/lance/src/dataset/mem_wal/write.rs b/rust/lance/src/dataset/mem_wal/write.rs index 57acaf42ccd..227ba2e19d3 100644 --- a/rust/lance/src/dataset/mem_wal/write.rs +++ b/rust/lance/src/dataset/mem_wal/write.rs @@ -177,6 +177,21 @@ pub struct ShardWriterConfig { /// Default: 60 seconds pub stats_log_interval: Option, + /// How long a frozen memtable lingers in memory after its flush commits, + /// before it is evicted and served only from the on-disk flushed dataset. + /// + /// `Duration::ZERO` (the default) disables retention: evict on commit, no + /// sweep ticker. Correct for single-shot queries, which can't observe a + /// generation evicted mid-read. + /// + /// A non-zero value is required only for queries split across reads (e.g. + /// fresh tier and base table read separately, then deduped): the flushed + /// dataset loses the per-batch boundaries that bound as-of membership + /// (see [`crate::dataset::mem_wal::scanner::FreshTierWatermark`]), so a + /// generation evicted between a query's reads can serve a stale row. Set it + /// above the worst-case multi-part query latency, with margin. + pub frozen_memtable_grace: Duration, + /// Whether to maintain an in-memory MemTable on top of the WAL. 
/// /// When `true` (default), the writer maintains an in-memory `MemTable`, @@ -236,6 +251,7 @@ impl Default for ShardWriterConfig { async_index_buffer_rows: 10_000, async_index_interval: Duration::from_secs(1), stats_log_interval: Some(Duration::from_secs(60)), // 1 minute + frozen_memtable_grace: Duration::ZERO, enable_memtable: true, hnsw_params: HashMap::new(), } @@ -335,6 +351,13 @@ impl ShardWriterConfig { self } + /// Set how long a flushed memtable lingers in memory before eviction. If non-zero, + /// it MUST exceed the worst-case multi-part query latency — see `frozen_memtable_grace`. + pub fn with_frozen_memtable_grace(mut self, grace: Duration) -> Self { + self.frozen_memtable_grace = grace; + self + } + /// Toggle the in-memory MemTable layer. See `enable_memtable` for the /// full WAL-only-mode contract. Defaults to `true`. pub fn with_enable_memtable(mut self, enable: bool) -> Self { @@ -708,6 +731,15 @@ pub struct WriteResult { pub batch_positions: std::ops::Range, } +/// A sealed memtable kept queryable in memory. `flushed_at_ms` is `None` while +/// the generation is still awaiting (or retrying) its flush, and `Some(t)` once +/// the flush commits — after which it lingers for `frozen_memtable_grace` so +/// in-flight as-of reads keep batch-resolved membership, then is swept. +struct FrozenMemTable { + memtable: Arc, + flushed_at_ms: Option, +} + /// ShardWriter state shared across tasks. struct WriterState { memtable: MemTable, @@ -716,12 +748,13 @@ struct WriterState { frozen_memtable_bytes: usize, /// Flush watchers for frozen memtables (for backpressure). frozen_flush_watchers: VecDeque<(usize, DurabilityWatcher)>, - /// Sealed-but-undrained memtables, kept queryable so a concurrent reader - /// sees no hole between `freeze_memtable` and the flush task's manifest - /// commit. Pushed in `freeze_memtable`; removed by generation in - /// `flush_memtable` on commit success only (retained on failure until a - /// later flush or WAL replay on reopen).
- frozen_memtables: VecDeque>, + /// Sealed memtables, kept queryable so a concurrent reader sees no hole + /// between `freeze_memtable` and the flush task's manifest commit, and for + /// `frozen_memtable_grace` beyond it so as-of reads stay batch-resolved. + /// Pushed in `freeze_memtable`; stamped `flushed_at_ms` by `flush_memtable` + /// on commit success only (retained un-stamped on failure until a later + /// flush or WAL replay on reopen); swept after the grace by `SweepExpired`. + frozen_memtables: VecDeque, /// Flag to prevent duplicate memtable flush requests. flush_requested: bool, /// Counter for WAL flush threshold crossings. @@ -968,10 +1001,13 @@ impl SharedWriterState { let frozen_memtable = Arc::new(old_memtable); - // Keep this generation queryable until its manifest commit lands - // (dropped in `flush_memtable`, success only). Arc refcount, not a - // copy — the flush task holds it alive for the whole drain anyway. - state.frozen_memtables.push_back(frozen_memtable.clone()); + // Keep this generation queryable past its manifest commit (swept after + // the grace by `SweepExpired`). Arc refcount, not a copy — the flush + // task holds it alive for the whole drain anyway. + state.frozen_memtables.push_back(FrozenMemTable { + memtable: frozen_memtable.clone(), + flushed_at_ms: None, + }); debug!( "Frozen memtable generation {}, pending_count = {}", @@ -979,7 +1015,7 @@ impl SharedWriterState { state.frozen_flush_watchers.len() ); - let _ = self.memtable_flush_tx.send(TriggerMemTableFlush { + let _ = self.memtable_flush_tx.send(TriggerMemTableFlush::Flush { memtable: frozen_memtable, done: None, }); @@ -1399,8 +1435,14 @@ impl ShardWriter { // Background MemTable flush handler — frozen memtable to Lance file. // It rebuilds the same secondary indexes on each flushed generation. 
- let memtable_handler = - MemTableFlushHandler::new(state.clone(), flusher, epoch, index_configs.to_vec(), stats); + let memtable_handler = MemTableFlushHandler::new( + state.clone(), + flusher, + epoch, + index_configs.to_vec(), + stats, + config.frozen_memtable_grace, + ); task_executor.add_handler( "memtable_flusher".to_string(), Box::new(memtable_handler), @@ -1811,7 +1853,7 @@ impl ShardWriter { frozen: state .frozen_memtables .iter() - .map(|m| in_memory_ref(m)) + .map(|m| in_memory_ref(&m.memtable)) .collect(), }) } @@ -2204,6 +2246,9 @@ struct MemTableFlushHandler { /// at all. index_configs: Vec, stats: SharedWriteStats, + /// How long a frozen memtable lingers in memory after its flush commits + /// before `SweepExpired` evicts it. See `ShardWriterConfig::frozen_memtable_grace`. + grace: Duration, } impl MemTableFlushHandler { @@ -2213,6 +2258,7 @@ impl MemTableFlushHandler { epoch: u64, index_configs: Vec, stats: SharedWriteStats, + grace: Duration, ) -> Self { Self { state, @@ -2220,22 +2266,51 @@ impl MemTableFlushHandler { epoch, index_configs, stats, + grace, } } + + /// Evict frozen memtables whose post-flush grace has elapsed. Un-stamped + /// (not-yet-flushed) entries are always kept. + async fn sweep_expired_frozen(&self) { + let now = now_millis(); + let grace_ms = self.grace.as_millis() as u64; + let mut state = self.state.write().await; + state + .frozen_memtables + .retain(|frozen| match frozen.flushed_at_ms { + Some(flushed_at) => now.saturating_sub(flushed_at) < grace_ms, + None => true, + }); + } } #[async_trait] impl MessageHandler for MemTableFlushHandler { - async fn handle(&mut self, message: TriggerMemTableFlush) -> Result<()> { - let TriggerMemTableFlush { memtable, done } = message; + fn tickers(&mut self) -> Vec<(Duration, MessageFactory)> { + // Zero grace evicts on commit, so no sweeper is needed. 
+ if self.grace.is_zero() { + return vec![]; + } + // Sweep often enough that eviction lags the grace by at most ~1/3, so a + // generation lives no more than ~grace * 4/3 past its flush commit. + let tick = (self.grace / 3).max(Duration::from_millis(100)); + vec![(tick, Box::new(|| TriggerMemTableFlush::SweepExpired))] + } - let result = self.flush_memtable(memtable).await; - if let Some(tx) = done { - // Send result through the channel - caller is waiting for it - let _ = tx.send(result); - } else { - // No done channel, propagate errors - result?; + async fn handle(&mut self, message: TriggerMemTableFlush) -> Result<()> { + match message { + TriggerMemTableFlush::Flush { memtable, done } => { + let result = self.flush_memtable(memtable).await; + if let Some(tx) = done { + // Send result through the channel - caller is waiting for it + let _ = tx.send(result); + } else { + // No done channel, propagate errors + result?; + } + } + TriggerMemTableFlush::SweepExpired => self.sweep_expired_frozen().await, } Ok(()) } @@ -2329,15 +2404,26 @@ impl MemTableFlushHandler { state.frozen_memtable_bytes = state.frozen_memtable_bytes.saturating_sub(memtable_size); } - // Drop the queryable handle ONLY on commit success. On failure - // keep it: rows must stay in the read union until a later flush - // or WAL replay, else a transient flush error reopens the hole. - // Keyed by generation, so non-FIFO completion is fine. + // Retire the frozen handle on commit success, keyed by generation + // (non-FIFO completion is fine). Zero grace evicts here; otherwise + // stamp the grace clock so it lingers for multi-part as-of reads + // until `SweepExpired`. On failure leave it un-stamped: rows stay in + // the read union until a later flush or WAL replay, else a transient + // error reopens the hole. 
if flush_result.is_ok() { let flushed_generation = memtable.generation(); - state - .frozen_memtables - .retain(|m| m.generation() != flushed_generation); + if self.grace.is_zero() { + state + .frozen_memtables + .retain(|frozen| frozen.memtable.generation() != flushed_generation); + } else { + let now = now_millis(); + for frozen in state.frozen_memtables.iter_mut() { + if frozen.memtable.generation() == flushed_generation { + frozen.flushed_at_ms = Some(now); + } + } + } } } @@ -4220,10 +4306,12 @@ mod tests { writer.close().await.unwrap(); } - /// On a successful flush commit the sealed generation is dropped from - /// the queryable set (no leak), and its rows land in the manifest. + /// On a successful flush commit the sealed generation's rows land in the + /// manifest immediately, but the in-memory handle is NOT dropped — it + /// lingers for `frozen_memtable_grace` (so in-flight as-of reads keep + /// batch-resolved membership), then is swept by the `SweepExpired` ticker. #[tokio::test] - async fn test_frozen_dropped_after_successful_flush() { + async fn test_frozen_retained_during_grace_then_swept() { let (store, base_path, base_uri, _temp_dir) = create_local_store().await; let schema = create_test_schema(); let config = ShardWriterConfig { @@ -4235,6 +4323,8 @@ mod tests { max_wal_flush_interval: None, max_memtable_size: 64 * 1024 * 1024, manifest_scan_batch_size: 2, + // Short grace so the sweep is observable without a slow test. + frozen_memtable_grace: Duration::from_secs(1), ..Default::default() }; let writer = ShardWriter::open(store, base_path, base_uri, config, schema.clone(), vec![]) @@ -4249,13 +4339,66 @@ mod tests { writer.force_seal_active().await.unwrap(); writer.wait_for_flush_drain().await.unwrap(); + // Recorded in the manifest at commit time. 
+ let manifest = writer.manifest().await.unwrap().expect("manifest exists"); + assert!( + manifest + .flushed_generations + .iter() + .any(|g| g.generation == initial_gen), + "flushed generation must be recorded in the manifest" + ); + + // Still queryable in memory immediately after commit (within grace). + let refs = writer.in_memory_memtable_refs().await.unwrap(); + assert_eq!(refs.active.generation, initial_gen + 1); + assert!( + refs.frozen.iter().any(|f| f.generation == initial_gen), + "flushed generation must stay queryable during the grace window" + ); + + // After the grace elapses (plus a sweep tick) the handle is evicted. + tokio::time::sleep(Duration::from_millis(1_500)).await; let refs = writer.in_memory_memtable_refs().await.unwrap(); assert!( refs.frozen.is_empty(), - "frozen handle must be dropped once the flush commit lands" + "frozen handle must be swept once the grace elapses" ); - assert_eq!(refs.active.generation, initial_gen + 1); + writer.close().await.unwrap(); + } + + /// With zero grace (the default) a frozen handle is evicted synchronously on + /// flush commit — no sweep tick, no lingering window. 
+ #[tokio::test] + async fn test_frozen_evicted_immediately_with_zero_grace() { + let (store, base_path, base_uri, _temp_dir) = create_local_store().await; + let schema = create_test_schema(); + let config = ShardWriterConfig { + shard_id: Uuid::new_v4(), + shard_spec_id: 0, + durable_write: false, + sync_indexed_write: false, + max_wal_buffer_size: 64 * 1024 * 1024, + max_wal_flush_interval: None, + max_memtable_size: 64 * 1024 * 1024, + manifest_scan_batch_size: 2, + frozen_memtable_grace: Duration::ZERO, + ..Default::default() + }; + let writer = ShardWriter::open(store, base_path, base_uri, config, schema.clone(), vec![]) + .await + .unwrap(); + + let initial_gen = writer.memtable_stats().await.unwrap().generation; + writer + .put(vec![create_test_batch(&schema, 0, 10)]) + .await + .unwrap(); + writer.force_seal_active().await.unwrap(); + writer.wait_for_flush_drain().await.unwrap(); + + // Rows are durably in the manifest... let manifest = writer.manifest().await.unwrap().expect("manifest exists"); assert!( manifest @@ -4265,6 +4408,13 @@ mod tests { "flushed generation must be recorded in the manifest" ); + // ...and the in-memory handle is already gone, no sweep tick needed. + let refs = writer.in_memory_memtable_refs().await.unwrap(); + assert!( + refs.frozen.is_empty(), + "frozen handle must be evicted on commit when grace is zero" + ); + writer.close().await.unwrap(); } From cfc3b6cc84f55d43f1387111c20c29e2964c3712 Mon Sep 17 00:00:00 2001 From: Yang Cen Date: Thu, 18 Jun 2026 22:37:43 +0800 Subject: [PATCH 142/177] perf(index): reuse WAND doc info (#7358) ## Performance Improvement This PR removes redundant `PostingIterator::doc()` calls in the WAND lead/tail scoring paths without changing `PostingIterator`, `HeadPosting`, heap ordering, public APIs, docs, or wire formats. The existing `HeadPosting` doc id cache remains unchanged. 
The refactor reuses already-read `DocInfo` values where the iterator position is known to be the same: - reuse the first lead posting's `DocInfo` in `Wand::next` for doc length, first lead score, and the returned candidate - call `posting.doc()` once in `advance_tail_top` after `posting.next(target)` and reuse it for target matching and scoring - apply the same single-`doc()` pattern per tail posting in `advance_all_tail` ## Benchmark Fresh GCP VM run with `search-benchmark` FTS static suite, `lance_fts`, `match`, `query_length=5`, `k=10`, `num_queries=1000`, `prewarm_index=true`, `with_position=false`, dataset `gs://fts-bench/yang-db/wikipedia-bench-v2-20260327.lance`. Search-benchmark commit: `8d05d5680f2fbac38c1642380dc98b8a2bb2140f`. Measured baseline/candidate SHAs from the pre-rebase benchmark branch: - baseline: `8743fbaf9ce0dce15373629a7574e14d5c6c9367` - candidate: `6ac7b2c061d34142838c340fb2dbda509506da63` Results from one run: - QPS: 274.768 -> 279.352 (+1.668%) - avg latency: 28.936 ms -> 28.495 ms (-1.523%) - p50: 21.703 ms -> 22.259 ms (+2.562% slower) - p90: 59.845 ms -> 59.059 ms (-1.313%) - p99: 131.076 ms -> 113.052 ms (-13.751%) The benchmark was run before rebasing this PR onto the latest `main`; the PR commit contains the same WAND code change rebased onto `c4e65645d75b17c572fee62809b013b3730370bb`. 
## Validation On the rebased PR branch: - `cargo fmt --all --check` - `cargo test -p lance-index scalar::inverted::wand` - `cargo check -p lance-index --tests` - `cargo clippy --all --tests --benches -- -D warnings` --- rust/lance-index/src/scalar/inverted/wand.rs | 49 ++++++++------------ 1 file changed, 20 insertions(+), 29 deletions(-) diff --git a/rust/lance-index/src/scalar/inverted/wand.rs b/rust/lance-index/src/scalar/inverted/wand.rs index 259de6ee06f..dda0e69e7f8 100644 --- a/rust/lance-index/src/scalar/inverted/wand.rs +++ b/rust/lance-index/src/scalar/inverted/wand.rs @@ -976,41 +976,37 @@ impl<'a, S: Scorer> Wand<'a, S> { continue; } - let Some(doc) = self.lead.first().and_then(|posting| posting.doc()) else { + let Some(first_doc) = self.lead.first().and_then(|posting| posting.doc()) else { self.push_back_leads(target + 1); continue; }; - let doc_length = match &doc { + let doc_length = match &first_doc { DocInfo::Raw(doc) => self.docs.num_tokens(doc.doc_id), DocInfo::Located(doc) => self.docs.num_tokens_by_row_id(doc.row_id), }; - let mut lead_score = self - .lead - .iter() - .filter_map(|posting| { - posting.doc().map(|lead_doc| { - posting.score(&self.scorer, lead_doc.frequency(), doc_length) - }) - }) - .sum::(); + let mut lead_score = 0.0; + if let Some(first_posting) = self.lead.first() { + lead_score += first_posting.score(&self.scorer, first_doc.frequency(), doc_length); + } + for posting in self.lead.iter().skip(1) { + if let Some(lead_doc) = posting.doc() { + lead_score += posting.score(&self.scorer, lead_doc.frequency(), doc_length); + } + } while lead_score <= self.threshold { if lead_score + self.tail_max_score <= self.threshold { - self.push_back_leads(doc.doc_id() + 1); + self.push_back_leads(first_doc.doc_id() + 1); break; } if !self.advance_tail_top(target, doc_length, &mut lead_score) { - self.push_back_leads(doc.doc_id() + 1); + self.push_back_leads(first_doc.doc_id() + 1); break; } } if !self.lead.is_empty() { - return Ok(self - 
.lead - .first() - .and_then(|posting| posting.doc()) - .map(|doc| (doc, lead_score))); + return Ok(Some((first_doc, lead_score))); } } @@ -1401,10 +1397,9 @@ impl<'a, S: Scorer> Wand<'a, S> { }; self.tail_max_score -= upper_bound; posting.next(target); - match posting.doc().map(|doc| doc.doc_id()) { - Some(doc_id) if doc_id == target => { - let frequency = posting.doc().expect("posting must exist").frequency(); - *lead_score += posting.score(&self.scorer, frequency, doc_length); + match posting.doc() { + Some(doc) if doc.doc_id() == target => { + *lead_score += posting.score(&self.scorer, doc.frequency(), doc_length); self.lead.push(posting); } Some(_) => self.push_head(posting), @@ -1427,14 +1422,10 @@ impl<'a, S: Scorer> Wand<'a, S> { for tail_posting in tail.into_vec() { let mut posting = tail_posting.posting; posting.next(target); - match posting.doc().map(|doc| doc.doc_id()) { - Some(doc_id) if doc_id == target => { + match posting.doc() { + Some(doc) if doc.doc_id() == target => { if let (Some(doc_length), Some(score)) = (doc_length, score.as_deref_mut()) { - let frequency = posting - .doc() - .expect("posting moved to target should have doc") - .frequency(); - *score += posting.score(&self.scorer, frequency, doc_length); + *score += posting.score(&self.scorer, doc.frequency(), doc_length); } self.lead.push(posting) } From d0d8dad34099dff536e9549f267109f22eeb78d1 Mon Sep 17 00:00:00 2001 From: Dan Rammer Date: Thu, 18 Jun 2026 11:03:32 -0500 Subject: [PATCH 143/177] feat(mem_wal): warm flushed generations into shared caches before query (#7284) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Evolve `FlushedMemTableCache` into the unified warm/open interface for mem_wal flushed generations, and populate the caches **before** a generation is queryable so the first query sees zero cold reads. 
- `FlushedMemTableCache` now owns a required `Session` (the index `CacheBackend` seam) and an optional read-through `WrappingObjectStore` (page cache), threading both into every open. `get_or_open(path)` drops its per-call session arg. - New `warm(path, pk_columns)`: open + `prewarm_all_indexes` (FTS) + `get_or_build_pk_hashes` (vector block-list), bounded by a semaphore and idempotent via a `warmed` gate. `open_flushed_dataset` fires a warm-on-open backstop. - `retain_paths` is now async and actively evicts retired generations' index objects via the new `Session::invalidate_index_prefix`; the byte cache is left to LRU. - `MemTableFlusher` warms each generation pre-commit, **best-effort** (logged on error, never blocks `update_manifest`), threaded via `ShardWriterConfig.flushed_cache`. This is the Lance-side building block for WAL-pod flushed-generation caching (consumed by sophon, which supplies the backed `Session` + read-through pool). ## Test plan - `cargo test -p lance --lib mem_wal::scanner::flushed_cache` (7 tests, incl. 
warm/idempotency/pk-hash/retain) — pass - `cargo test -p lance --lib mem_wal::memtable::flush` (8 tests) — pass - `cargo clippy -p lance --tests --benches` — clean - `cargo fmt --all` 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.8 (1M context) --- .../benches/mem_wal/write/mem_wal_write.rs | 1 + rust/lance/src/dataset/mem_wal/api.rs | 12 +- .../src/dataset/mem_wal/memtable/flush.rs | 107 +++++++++++++- rust/lance/src/dataset/mem_wal/scanner.rs | 2 +- .../src/dataset/mem_wal/scanner/block_list.rs | 10 +- .../src/dataset/mem_wal/scanner/builder.rs | 33 ++++- .../dataset/mem_wal/scanner/flushed_cache.rs | 130 ++++++++++++++++-- .../src/dataset/mem_wal/scanner/fts_search.rs | 25 +++- .../src/dataset/mem_wal/scanner/planner.rs | 25 +++- .../dataset/mem_wal/scanner/point_lookup.rs | 25 +++- .../dataset/mem_wal/scanner/vector_search.rs | 25 +++- rust/lance/src/dataset/mem_wal/write.rs | 18 ++- 12 files changed, 353 insertions(+), 60 deletions(-) diff --git a/rust/lance/benches/mem_wal/write/mem_wal_write.rs b/rust/lance/benches/mem_wal/write/mem_wal_write.rs index a92ec1f8847..9a5fc71ab17 100644 --- a/rust/lance/benches/mem_wal/write/mem_wal_write.rs +++ b/rust/lance/benches/mem_wal/write/mem_wal_write.rs @@ -652,6 +652,7 @@ fn bench_lance_memwal_write(c: &mut Criterion) { frozen_memtable_grace: default_config.frozen_memtable_grace, enable_memtable, hnsw_params: default_config.hnsw_params, + warmer: None, }; // Get writer through Dataset API (index configs loaded automatically) diff --git a/rust/lance/src/dataset/mem_wal/api.rs b/rust/lance/src/dataset/mem_wal/api.rs index b67f6434c9c..79184c13ec8 100644 --- a/rust/lance/src/dataset/mem_wal/api.rs +++ b/rust/lance/src/dataset/mem_wal/api.rs @@ -26,7 +26,7 @@ use crate::index::mem_wal::{load_mem_wal_index_details, new_mem_wal_index_meta}; use super::ShardWriterConfig; use super::scanner::flushed_cache::open_flushed_dataset; -use 
super::scanner::{FlushedMemTableCache, ShardSnapshot}; +use super::scanner::{DatasetCache, ShardSnapshot}; use super::write::MemIndexConfig; use super::write::ShardWriter; @@ -500,7 +500,7 @@ pub trait DatasetMemWalExt { async fn prewarm_mem_wal( &self, _snapshots: &[ShardSnapshot], - _cache: Option<&Arc>, + _cache: Option<&Arc>, ) -> Result<()> { Ok(()) } @@ -586,7 +586,7 @@ impl DatasetMemWalExt for Dataset { async fn prewarm_mem_wal( &self, snapshots: &[ShardSnapshot], - cache: Option<&Arc>, + cache: Option<&Arc>, ) -> Result<()> { let session = self.session(); // Resolve flushed paths exactly as the LSM collector does, so the @@ -601,7 +601,8 @@ impl DatasetMemWalExt for Dataset { snapshot.flushed_generations.iter().map(move |flushed| { let path = format!("{}/_mem_wal/{}/{}", base_path, shard_id, flushed.path); async move { - let dataset = open_flushed_dataset(&path, Some(session), cache).await?; + let dataset = + open_flushed_dataset(&path, Some(session), cache, None).await?; prewarm_all_indexes(&dataset).await } }) @@ -762,6 +763,7 @@ async fn load_vector_index_config( #[cfg(test)] mod tests { + use super::super::scanner::FlushedMemTableCache; use super::*; use arrow_array::{Int32Array, RecordBatch, RecordBatchIterator}; @@ -831,7 +833,7 @@ mod tests { .with_current_generation(2) .with_flushed_generation(1, folder.to_string()); - let cache = Arc::new(FlushedMemTableCache::new(4)); + let cache: Arc = Arc::new(FlushedMemTableCache::new(4)); base.prewarm_mem_wal(std::slice::from_ref(&snapshot), Some(&cache)) .await .expect("prewarm must open the generation and warm its index"); diff --git a/rust/lance/src/dataset/mem_wal/memtable/flush.rs b/rust/lance/src/dataset/mem_wal/memtable/flush.rs index bf462b6e226..ebcc06cab44 100644 --- a/rust/lance/src/dataset/mem_wal/memtable/flush.rs +++ b/rust/lance/src/dataset/mem_wal/memtable/flush.rs @@ -18,7 +18,7 @@ use lance_io::object_store::ObjectStore; use lance_table::format::IndexMetadata; use 
lance_table::io::commit::write_manifest_file_to_path; use lance_table::io::deletion::write_deletion_file; -use log::info; +use log::{info, warn}; use object_store::ObjectStoreExt; use object_store::path::Path; use roaring::RoaringBitmap; @@ -29,6 +29,7 @@ use super::super::index::MemIndexConfig; use super::super::memtable::MemTable; use crate::Dataset; use crate::dataset::mem_wal::manifest::ShardManifestStore; +use crate::dataset::mem_wal::scanner::GenerationWarmer; use crate::dataset::mem_wal::scanner::exec::{compute_pk_hash, validate_pk_types}; use crate::dataset::mem_wal::util::{flushed_memtable_path, generate_random_hash}; @@ -68,6 +69,9 @@ pub struct MemTableFlusher { base_uri: String, shard_id: Uuid, manifest_store: Arc, + /// When present, each new generation is warmed before it is committed, so + /// the first query sees zero cold reads. `None` => no warming. + warmer: Option>, } impl MemTableFlusher { @@ -84,6 +88,26 @@ impl MemTableFlusher { base_uri: base_uri.into(), shard_id, manifest_store, + warmer: None, + } + } + + /// Attach the warmer fired pre-commit for each new generation. + pub fn with_warmer(mut self, warmer: Option>) -> Self { + self.warmer = warmer; + self + } + + /// Warm a just-written generation before it is committed. Best-effort: a + /// failure is logged and the flush proceeds — warming is never a commit + /// gate. No-op without a warmer. `uri` must be the resolved reader path + /// (`path_to_uri(gen_path)`) so warmed entries key-match later queries. + async fn warm_generation(&self, uri: &str) { + let Some(warmer) = &self.warmer else { + return; + }; + if let Err(e) = warmer.warm(uri).await { + warn!("pre-commit warm failed for generation {uri}; committing cold: {e}"); } } @@ -184,6 +208,10 @@ impl MemTableFlusher { // writes it on the indexed path.) No-op when the memtable has no PK. self.create_pk_index(&gen_path, memtable.indexes()).await?; + // Warm before commit (zero cold window); no-op without a warmer. 
+ let warm_uri = self.path_to_uri(&gen_path); + self.warm_generation(&warm_uri).await; + let new_manifest = self .update_manifest( epoch, @@ -469,6 +497,10 @@ impl MemTableFlusher { self.write_bloom_filter(&bloom_path, memtable.bloom_filter()) .await?; + // Warm before commit (zero cold window); no-op without a warmer. + let warm_uri = self.path_to_uri(&gen_path); + self.warm_generation(&warm_uri).await; + let new_manifest = self .update_manifest( epoch, @@ -1201,6 +1233,79 @@ mod tests { assert_eq!(updated_manifest.flushed_generations.len(), 1); } + /// A `GenerationWarmer` that counts calls and optionally fails. + #[derive(Debug)] + struct CountingWarmer { + calls: Arc, + fail: bool, + } + + #[async_trait::async_trait] + impl GenerationWarmer for CountingWarmer { + async fn warm(&self, _path: &str) -> Result<()> { + self.calls.fetch_add(1, std::sync::atomic::Ordering::SeqCst); + if self.fail { + Err(Error::io("simulated warm failure".to_string())) + } else { + Ok(()) + } + } + } + + /// Warming is a best-effort optimization, never a commit gate: a warmer that + /// errors pre-commit must still let the flush commit the generation. The + /// warm fires exactly once on the pre-commit path. 
+ #[tokio::test] + async fn test_flusher_commits_when_warm_fails() { + let (store, base_path, base_uri, _temp_dir) = create_local_store().await; + let shard_id = Uuid::new_v4(); + let manifest_store = Arc::new(ShardManifestStore::new( + store.clone(), + &base_path, + shard_id, + 2, + )); + let (epoch, _manifest) = manifest_store.claim_epoch(0).await.unwrap(); + + let schema = create_test_schema(); + let mut memtable = MemTable::new(schema.clone(), 1, vec![]).unwrap(); + let frag_id = memtable + .insert(create_test_batch(&schema, 10)) + .await + .unwrap(); + memtable.mark_wal_flushed(&[frag_id], 1, &[0]); + + let calls = Arc::new(std::sync::atomic::AtomicUsize::new(0)); + let warmer: Arc = Arc::new(CountingWarmer { + calls: calls.clone(), + fail: true, + }); + + let flusher = MemTableFlusher::new( + store.clone(), + base_path, + base_uri, + shard_id, + manifest_store.clone(), + ) + .with_warmer(Some(warmer)); + // Flush must succeed despite the warmer erroring. + let result = flusher.flush(&memtable, epoch, 1).await.unwrap(); + + assert_eq!(result.generation.generation, 1); + assert_eq!( + calls.load(std::sync::atomic::Ordering::SeqCst), + 1, + "pre-commit warm fires exactly once" + ); + let updated = manifest_store.read_latest().await.unwrap().unwrap(); + assert_eq!( + updated.flushed_generations.len(), + 1, + "generation still committed after a failed warm" + ); + } + /// Flushing a generation with within-generation duplicate PKs writes a /// deletion vector so the flushed dataset exposes newest-per-PK on scan. 
#[tokio::test] diff --git a/rust/lance/src/dataset/mem_wal/scanner.rs b/rust/lance/src/dataset/mem_wal/scanner.rs index aaf915fb81f..fe14bd82dd8 100644 --- a/rust/lance/src/dataset/mem_wal/scanner.rs +++ b/rust/lance/src/dataset/mem_wal/scanner.rs @@ -51,7 +51,7 @@ pub use collector::{ pub use data_source::{ FlushedGeneration, FreshTierWatermark, LsmDataSource, LsmGeneration, ShardSnapshot, }; -pub use flushed_cache::FlushedMemTableCache; +pub use flushed_cache::{DatasetCache, FlushedMemTableCache, GenerationWarmer}; pub use fts_search::{LsmFtsSearchPlanner, SCORE_COLUMN}; pub use point_lookup::LsmPointLookupPlanner; pub use projection::DISTANCE_COLUMN; diff --git a/rust/lance/src/dataset/mem_wal/scanner/block_list.rs b/rust/lance/src/dataset/mem_wal/scanner/block_list.rs index f7f957845e4..69d16930888 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/block_list.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/block_list.rs @@ -32,7 +32,7 @@ use lance_index::scalar::{ use uuid::Uuid; use super::data_source::{FreshTierWatermark, LsmDataSource, LsmGeneration}; -use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset}; +use super::flushed_cache::{DatasetCache, open_flushed_dataset}; use crate::dataset::mem_wal::index::encode_pk_tuple; use crate::dataset::mem_wal::util::PK_INDEX_DIR; use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; @@ -157,7 +157,7 @@ type ShardGenSets = HashMap>; pub async fn compute_source_block_lists( sources: &[LsmDataSource], session: Option<&Arc>, - flushed_cache: Option<&Arc>, + flushed_cache: Option<&Arc>, ) -> Result { // Membership per non-base source, grouped by shard (generations are // per-shard, so supersession is within-shard only). 
@@ -238,7 +238,7 @@ pub async fn compute_source_block_lists( pub async fn fresh_tier_block_list( sources: &[LsmDataSource], session: Option<&Arc>, - flushed_cache: Option<&Arc>, + flushed_cache: Option<&Arc>, watermarks: Option<&HashMap>, ) -> Result> { // Membership per source, in source order (`None` = skipped). Flushed @@ -379,9 +379,9 @@ fn path_cache_uuid(path: &str) -> Uuid { async fn open_pk_index( path: &str, session: Option<&Arc>, - flushed_cache: Option<&Arc>, + flushed_cache: Option<&Arc>, ) -> Result> { - let dataset = open_flushed_dataset(path, session, flushed_cache).await?; + let dataset = open_flushed_dataset(path, session, flushed_cache, None).await?; // Namespace the session index cache by the (immutable) flushed path so this // sidecar's pages live alongside every other index instead of a bespoke // cache. `fri_uuid` is None — flushed generations carry no fragment-reuse. diff --git a/rust/lance/src/dataset/mem_wal/scanner/builder.rs b/rust/lance/src/dataset/mem_wal/scanner/builder.rs index e4940ebf706..a006257493b 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/builder.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/builder.rs @@ -21,7 +21,7 @@ use uuid::Uuid; use super::collector::{InMemoryMemTableRef, InMemoryMemTables, LsmDataSourceCollector}; use super::data_source::{FreshTierWatermark, ShardSnapshot}; -use super::flushed_cache::FlushedMemTableCache; +use super::flushed_cache::{DatasetCache, GenerationWarmer}; use super::planner::LsmScanPlanner; use super::point_lookup::LsmPointLookupPlanner; use crate::dataset::Dataset; @@ -124,7 +124,9 @@ pub struct LsmScanner { session: Option>, /// Cache of opened flushed-generation datasets. When set, repeated /// queries against the same generation skip the manifest read entirely. - flushed_cache: Option>, + flushed_cache: Option>, + /// Optional warmer fired on first open of a flushed generation. 
+ warmer: Option>, /// Over-fetch multiple for block-listed sources in search plans /// (see [`super::LsmFtsSearchPlanner::with_overfetch_factor`]). overfetch_factor: Option, @@ -163,6 +165,7 @@ impl LsmScanner { pk_columns, session, flushed_cache: None, + warmer: None, overfetch_factor: None, } } @@ -202,6 +205,7 @@ impl LsmScanner { pk_columns, session: None, flushed_cache: None, + warmer: None, overfetch_factor: None, } } @@ -251,13 +255,21 @@ impl LsmScanner { /// /// With a cache, repeated queries against the same generation become a /// pure `Arc::clone` with no manifest read or object-store I/O. The cache - /// is owned and sized by the caller (see [`FlushedMemTableCache`]); not - /// set by default, so behavior is unchanged unless opted in. - pub fn with_flushed_cache(mut self, cache: Arc) -> Self { + /// is owned and sized by the caller (any [`DatasetCache`] impl, e.g. + /// [`FlushedMemTableCache`](super::FlushedMemTableCache)); not set by + /// default, so behavior is unchanged unless opted in. + pub fn with_flushed_cache(mut self, cache: Arc) -> Self { self.flushed_cache = Some(cache); self } + /// Inject the warmer fired on first open of a flushed generation. Not set by + /// default, so behavior is unchanged unless opted in. + pub fn with_warmer(mut self, warmer: Arc) -> Self { + self.warmer = Some(warmer); + self + } + /// Set the over-fetch multiple block-listed sources use in search plans /// so they still yield `k` live rows after cross-generation dedup. /// Threaded into [`super::LsmFtsSearchPlanner`]; clamped to `>= 1.0`. 
@@ -367,6 +379,9 @@ impl LsmScanner { if let Some(cache) = &self.flushed_cache { planner = planner.with_flushed_cache(cache.clone()); } + if let Some(warmer) = &self.warmer { + planner = planner.with_warmer(warmer.clone()); + } let plan = planner .plan_point_lookup(&keys, self.projection.as_deref()) .await?; @@ -383,6 +398,9 @@ impl LsmScanner { if let Some(cache) = &self.flushed_cache { planner = planner.with_flushed_cache(cache.clone()); } + if let Some(warmer) = &self.warmer { + planner = planner.with_warmer(warmer.clone()); + } if let Some(factor) = self.overfetch_factor { planner = planner.with_overfetch_factor(factor); } @@ -421,6 +439,9 @@ impl LsmScanner { if let Some(cache) = &self.flushed_cache { planner = planner.with_flushed_cache(cache.clone()); } + if let Some(warmer) = &self.warmer { + planner = planner.with_warmer(warmer.clone()); + } if let Some(factor) = self.overfetch_factor { planner = planner.with_overfetch_factor(factor); } @@ -473,7 +494,7 @@ impl LsmScanner { /// the primary-key columns; the returned `Vec` is aligned with its /// rows. Hashing matches the scanner's internal dedup, so the caller never /// hashes PKs itself. Flushed membership comes from the injected - /// [`FlushedMemTableCache`] when one is set. + /// [`DatasetCache`] when one is set. 
pub async fn contains_pks(&self, pks: &RecordBatch) -> Result> { self.contains_pks_at(pks, None).await } diff --git a/rust/lance/src/dataset/mem_wal/scanner/flushed_cache.rs b/rust/lance/src/dataset/mem_wal/scanner/flushed_cache.rs index 0c2d3b039fe..7a5280bedb8 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/flushed_cache.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/flushed_cache.rs @@ -22,6 +22,7 @@ use std::collections::HashSet; use std::sync::Arc; +use async_trait::async_trait; use lance_core::{Error, Result}; use crate::dataset::{Dataset, DatasetBuilder}; @@ -115,29 +116,92 @@ impl std::fmt::Debug for FlushedMemTableCache { } } +/// Caching of opened flushed-generation datasets, keyed by immutable path. The +/// opened dataset carries the session index cache, which also backs each +/// generation's secondary indexes and its PK dedup sidecar (see +/// `block_list::open_pk_index`) — so a single `get_or_open` is the +/// whole caching surface. Implemented by [`FlushedMemTableCache`]; a +/// [`GenerationWarmer`] composes one to warm through it, and a consumer may +/// supply its own implementation. +#[async_trait] +pub trait DatasetCache: Send + Sync + std::fmt::Debug { + async fn get_or_open(&self, path: &str, session: Option>) -> Result>; + + /// Drop cached entries whose path is not in `live_paths`. Async so an + /// implementation can evict retired generations' index objects (e.g. + /// `Session::invalidate_index_prefix`) without a later breaking signature + /// change; [`FlushedMemTableCache`]'s own eviction is synchronous. 
+ async fn retain_paths(&self, live_paths: &HashSet); +} + +#[async_trait] +impl DatasetCache for FlushedMemTableCache { + async fn get_or_open(&self, path: &str, session: Option>) -> Result> { + Self::get_or_open(self, path, session).await + } + + async fn retain_paths(&self, live_paths: &HashSet) { + Self::retain_paths(self, live_paths) + } +} + +/// Proactively warms a flushed generation into the shared caches: open the +/// dataset and pre-load its secondary indexes and PK dedup sidecar so the first +/// query sees no cold reads. This is the **seam** the flush and read paths fire +/// — lance defines it; the consumer (e.g. the WAL pod) implements it. `None` => +/// no warming, generations warm lazily on first read. +/// +/// Everything a warmer touches is keyed by the immutable generation `path` +/// (opened dataset, its secondary indexes, its PK dedup sidecar), so `path` is +/// the only input it needs. +/// +/// `warm` is fired fire-and-forget from every read path that opens a generation +/// (all four LSM planners) as well as pre-commit on flush, so the same path may +/// be warmed concurrently and repeatedly. Implementations **must be idempotent +/// and cheap when the path is already warm** (e.g. dedup in-flight and +/// completed paths) — a redundant call must not re-do work or fail. +#[async_trait] +pub trait GenerationWarmer: Send + Sync + std::fmt::Debug { + async fn warm(&self, path: &str) -> Result<()>; +} + /// Open a flushed-generation dataset, shared by all three LSM open sites /// (scan, point lookup, vector search). /// -/// - `cache` present: route through [`FlushedMemTableCache`] (single-flight, -/// shared `Arc`, manifest read amortized across queries). +/// - `cache` present: route through a [`DatasetCache`] (e.g. +/// [`FlushedMemTableCache`]: single-flight, shared `Arc`, manifest read +/// amortized across queries). /// - `cache` absent: cold open via [`DatasetBuilder`]. 
Passing `session` /// still reuses the shared index / metadata caches; `None`/`None` /// reproduces the original per-query cold-open behavior exactly. +/// - `warmer` present: fire a fire-and-forget warm-on-open backstop behind the +/// returned handle (the warmer dedups already-warm paths). `None` => no warming. pub async fn open_flushed_dataset( path: &str, session: Option<&Arc>, - cache: Option<&Arc>, + cache: Option<&Arc>, + warmer: Option<&Arc>, ) -> Result> { - match cache { - Some(cache) => cache.get_or_open(path, session.cloned()).await, + let dataset = match cache { + Some(cache) => cache.get_or_open(path, session.cloned()).await?, None => { let mut builder = DatasetBuilder::from_uri(path); if let Some(session) = session { builder = builder.with_session(session.clone()); } - Ok(Arc::new(builder.load().await?)) + Arc::new(builder.load().await?) } + }; + if let Some(warmer) = warmer { + let warmer = Arc::clone(warmer); + let path = path.to_string(); + tokio::spawn(async move { + if let Err(e) = warmer.warm(&path).await { + tracing::debug!(generation = %path, error = %e, "warm-on-open failed"); + } + }); } + Ok(dataset) } #[cfg(test)] @@ -257,8 +321,8 @@ mod tests { let uri = format!("{}/gen_1", temp_dir.path().to_str().unwrap()); write_dataset(&uri, &[7, 8, 9]).await; - let a = open_flushed_dataset(&uri, None, None).await.unwrap(); - let b = open_flushed_dataset(&uri, None, None).await.unwrap(); + let a = open_flushed_dataset(&uri, None, None, None).await.unwrap(); + let b = open_flushed_dataset(&uri, None, None, None).await.unwrap(); assert!( !Arc::ptr_eq(&a, &b), "no-cache path must cold-open each call" @@ -266,13 +330,57 @@ mod tests { assert_eq!(a.count_rows(None).await.unwrap(), 3); // With a cache, the second call is a shared clone. 
- let cache = Arc::new(FlushedMemTableCache::new(8)); - let c = open_flushed_dataset(&uri, None, Some(&cache)) + let cache: Arc = Arc::new(FlushedMemTableCache::new(8)); + let c = open_flushed_dataset(&uri, None, Some(&cache), None) .await .unwrap(); - let d = open_flushed_dataset(&uri, None, Some(&cache)) + let d = open_flushed_dataset(&uri, None, Some(&cache), None) .await .unwrap(); assert!(Arc::ptr_eq(&c, &d), "cached path must reuse the Arc"); } + + /// A warmer that records calls and signals each one. + #[derive(Debug)] + struct NotifyingWarmer { + calls: Arc, + notify: Arc, + } + + #[async_trait] + impl GenerationWarmer for NotifyingWarmer { + async fn warm(&self, _path: &str) -> Result<()> { + self.calls.fetch_add(1, Ordering::SeqCst); + self.notify.notify_one(); + Ok(()) + } + } + + #[tokio::test] + async fn test_open_flushed_dataset_fires_warm_on_open() { + // The warm-on-open backstop fires the warmer (fire-and-forget) when a + // generation is opened, so generations the flusher never warmed still + // get warmed lazily on first read. + let temp_dir = tempfile::tempdir().unwrap(); + let uri = format!("{}/gen_1", temp_dir.path().to_str().unwrap()); + write_dataset(&uri, &[1, 2, 3]).await; + + let calls = Arc::new(AtomicUsize::new(0)); + let notify = Arc::new(tokio::sync::Notify::new()); + let warmer: Arc = Arc::new(NotifyingWarmer { + calls: calls.clone(), + notify: notify.clone(), + }); + + let ds = open_flushed_dataset(&uri, None, None, Some(&warmer)) + .await + .unwrap(); + assert_eq!(ds.count_rows(None).await.unwrap(), 3); + + // The warm is spawned fire-and-forget; wait (bounded) for it to run. 
+ tokio::time::timeout(std::time::Duration::from_secs(5), notify.notified()) + .await + .expect("warm-on-open must fire"); + assert_eq!(calls.load(Ordering::SeqCst), 1, "warmer fired once on open"); + } } diff --git a/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs b/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs index 92298524535..e7c8d205d5d 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs @@ -53,7 +53,7 @@ use super::block_list::compute_source_block_lists; use super::collector::LsmDataSourceCollector; use super::data_source::LsmDataSource; use super::exec::{NewestPkFilterExec, PkBlockFilterExec}; -use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset}; +use super::flushed_cache::{DatasetCache, GenerationWarmer, open_flushed_dataset}; use super::projection::project_to_canonical; use crate::dataset::mem_wal::memtable::scanner::MemTableScanner; use crate::session::Session; @@ -76,7 +76,9 @@ pub struct LsmFtsSearchPlanner { /// Session threaded into flushed-generation opens (shared caches). session: Option>, /// Cache of opened flushed-generation datasets. - flushed_cache: Option>, + flushed_cache: Option>, + /// Optional warmer fired on first open of a flushed generation. + warmer: Option>, /// Over-fetch multiple for blocked sources (clamped to `>= 1.0`). overfetch_factor: f64, } @@ -94,6 +96,7 @@ impl LsmFtsSearchPlanner { base_schema, session: None, flushed_cache: None, + warmer: None, overfetch_factor: DEFAULT_OVERFETCH_FACTOR, } } @@ -114,11 +117,17 @@ impl LsmFtsSearchPlanner { /// Inject a cache of opened flushed-generation datasets, making repeated /// searches against the same generation a pure `Arc::clone`. - pub fn with_flushed_cache(mut self, cache: Arc) -> Self { + pub fn with_flushed_cache(mut self, cache: Arc) -> Self { self.flushed_cache = Some(cache); self } + /// Inject the warmer fired on first open of a flushed generation. 
+ pub fn with_warmer(mut self, warmer: Arc) -> Self { + self.warmer = Some(warmer); + self + } + /// Build the FTS execution plan (local scoring). /// /// # Arguments @@ -286,9 +295,13 @@ impl LsmFtsSearchPlanner { scanner.create_plan().await } LsmDataSource::FlushedMemTable { path, .. } => { - let dataset = - open_flushed_dataset(path, self.session.as_ref(), self.flushed_cache.as_ref()) - .await?; + let dataset = open_flushed_dataset( + path, + self.session.as_ref(), + self.flushed_cache.as_ref(), + self.warmer.as_ref(), + ) + .await?; let mut scanner = dataset.scan(); let cols = self.fts_scanner_projection(projection); scanner.project(&cols.iter().map(|s| s.as_str()).collect::>())?; diff --git a/rust/lance/src/dataset/mem_wal/scanner/planner.rs b/rust/lance/src/dataset/mem_wal/scanner/planner.rs index 8b74f9efd79..f040428f342 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/planner.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/planner.rs @@ -16,7 +16,7 @@ use tracing::instrument; use super::collector::LsmDataSourceCollector; use super::data_source::LsmDataSource; use super::exec::{MEMTABLE_GEN_COLUMN, MemtableGenTagExec, PkBlockFilterExec, ROW_ADDRESS_COLUMN}; -use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset}; +use super::flushed_cache::{DatasetCache, GenerationWarmer, open_flushed_dataset}; use super::projection::{ build_scanner_projection, canonical_output_schema, null_columns, project_to_canonical, }; @@ -33,7 +33,9 @@ pub struct LsmScanPlanner { /// Session threaded into flushed-generation opens (shared caches). session: Option>, /// Cache of opened flushed-generation datasets. - flushed_cache: Option>, + flushed_cache: Option>, + /// Optional warmer fired on first open of a flushed generation. + warmer: Option>, /// Over-fetch multiple for the per-source limit pushdown: block-listed /// sources scan `(offset + limit) * factor` rows so cross-gen dedup drops /// still leave enough live rows. Clamped to `>= 1.0`. 
@@ -53,6 +55,7 @@ impl LsmScanPlanner { base_schema, session: None, flushed_cache: None, + warmer: None, overfetch_factor: 1.0, } } @@ -66,11 +69,17 @@ impl LsmScanPlanner { /// Inject a cache of opened flushed-generation datasets, making repeated /// queries against the same generation a pure `Arc::clone`. - pub fn with_flushed_cache(mut self, cache: Arc) -> Self { + pub fn with_flushed_cache(mut self, cache: Arc) -> Self { self.flushed_cache = Some(cache); self } + /// Inject the warmer fired on first open of a flushed generation. + pub fn with_warmer(mut self, warmer: Arc) -> Self { + self.warmer = Some(warmer); + self + } + /// Set the over-fetch multiple for the per-source limit pushdown /// (see the field docs). Clamped to `>= 1.0` at use. pub fn with_overfetch_factor(mut self, factor: f64) -> Self { @@ -304,9 +313,13 @@ impl LsmScanPlanner { scanner.create_plan().await } LsmDataSource::FlushedMemTable { path, .. } => { - let dataset = - open_flushed_dataset(path, self.session.as_ref(), self.flushed_cache.as_ref()) - .await?; + let dataset = open_flushed_dataset( + path, + self.session.as_ref(), + self.flushed_cache.as_ref(), + self.warmer.as_ref(), + ) + .await?; let mut scanner = dataset.scan(); let cols = diff --git a/rust/lance/src/dataset/mem_wal/scanner/point_lookup.rs b/rust/lance/src/dataset/mem_wal/scanner/point_lookup.rs index 3902eb04589..2da4b5cd9a6 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/point_lookup.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/point_lookup.rs @@ -31,7 +31,7 @@ use crate::dataset::mem_wal::memtable::batch_store::BatchStore; use super::collector::LsmDataSourceCollector; use super::data_source::LsmDataSource; use super::exec::{BloomFilterGuardExec, CoalesceFirstExec, compute_pk_hash_from_scalars}; -use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset}; +use super::flushed_cache::{DatasetCache, GenerationWarmer, open_flushed_dataset}; use super::projection::{ build_scanner_projection, 
canonical_output_schema, null_columns, project_to_canonical, wants_row_address, wants_row_id, @@ -87,7 +87,9 @@ pub struct LsmPointLookupPlanner { /// Session threaded into flushed-generation opens (shared caches). session: Option>, /// Cache of opened flushed-generation datasets. - flushed_cache: Option>, + flushed_cache: Option>, + /// Optional warmer fired on first open of a flushed generation. + warmer: Option>, /// Precomputed canonical output schema for the no-projection case, so the /// hot `lookup(.., None)` path clones an `Arc` instead of rebuilding the /// schema on every call. @@ -120,6 +122,7 @@ impl LsmPointLookupPlanner { bloom_filters: std::collections::HashMap::new(), session: None, flushed_cache: None, + warmer: None, none_target, task_ctx: SessionContext::new().task_ctx(), } @@ -137,11 +140,17 @@ impl LsmPointLookupPlanner { /// front during scan setup via /// [`DatasetMemWalExt::prewarm_mem_wal`](crate::dataset::mem_wal::DatasetMemWalExt::prewarm_mem_wal) /// so the first gen-key lookup does not pay the dataset open. - pub fn with_flushed_cache(mut self, cache: Arc) -> Self { + pub fn with_flushed_cache(mut self, cache: Arc) -> Self { self.flushed_cache = Some(cache); self } + /// Inject the warmer fired on first open of a flushed generation. + pub fn with_warmer(mut self, warmer: Arc) -> Self { + self.warmer = Some(warmer); + self + } + /// Add a bloom filter for a generation. /// /// Bloom filters are optional but improve performance by skipping @@ -546,9 +555,13 @@ impl LsmPointLookupPlanner { scanner.create_plan().await? } LsmDataSource::FlushedMemTable { path, .. 
} => { - let dataset = - open_flushed_dataset(path, self.session.as_ref(), self.flushed_cache.as_ref()) - .await?; + let dataset = open_flushed_dataset( + path, + self.session.as_ref(), + self.flushed_cache.as_ref(), + self.warmer.as_ref(), + ) + .await?; let mut scanner = dataset.scan(); scanner.project(&cols.iter().map(|s| s.as_str()).collect::>())?; scanner.filter_expr(filter.clone()); diff --git a/rust/lance/src/dataset/mem_wal/scanner/vector_search.rs b/rust/lance/src/dataset/mem_wal/scanner/vector_search.rs index 71e0674aa79..7f849f3d8bf 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/vector_search.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/vector_search.rs @@ -27,7 +27,7 @@ use crate::io::exec::TakeExec; use super::collector::LsmDataSourceCollector; use super::data_source::LsmDataSource; -use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset}; +use super::flushed_cache::{DatasetCache, GenerationWarmer, open_flushed_dataset}; use super::projection::{ DISTANCE_COLUMN, build_scanner_projection, canonical_output_schema, null_columns, project_to_canonical, wants_row_id, @@ -93,7 +93,9 @@ pub struct LsmVectorSearchPlanner { /// Session threaded into flushed-generation opens (shared caches). session: Option>, /// Cache of opened flushed-generation datasets. - flushed_cache: Option>, + flushed_cache: Option>, + /// Optional warmer fired on first open of a flushed generation. + warmer: Option>, } impl LsmVectorSearchPlanner { @@ -122,6 +124,7 @@ impl LsmVectorSearchPlanner { dataset: None, session: None, flushed_cache: None, + warmer: None, } } @@ -134,11 +137,17 @@ impl LsmVectorSearchPlanner { /// Inject a cache of opened flushed-generation datasets, making repeated /// searches against the same generation a pure `Arc::clone`. 
- pub fn with_flushed_cache(mut self, cache: Arc) -> Self { + pub fn with_flushed_cache(mut self, cache: Arc) -> Self { self.flushed_cache = Some(cache); self } + /// Inject the warmer fired on first open of a flushed generation. + pub fn with_warmer(mut self, warmer: Arc) -> Self { + self.warmer = Some(warmer); + self + } + /// Set the base dataset for post-rerank take. /// /// After global PK dedup and sort, a `TakeExec` against this dataset @@ -447,9 +456,13 @@ impl LsmVectorSearchPlanner { Ok((scanner.create_plan().await?, None)) } LsmDataSource::FlushedMemTable { path, .. } => { - let dataset = - open_flushed_dataset(path, self.session.as_ref(), self.flushed_cache.as_ref()) - .await?; + let dataset = open_flushed_dataset( + path, + self.session.as_ref(), + self.flushed_cache.as_ref(), + self.warmer.as_ref(), + ) + .await?; let mut scanner = dataset.scan(); let cols = build_scanner_projection(projection, &self.base_schema, &self.pk_columns); diff --git a/rust/lance/src/dataset/mem_wal/write.rs b/rust/lance/src/dataset/mem_wal/write.rs index 227ba2e19d3..491bb68aec5 100644 --- a/rust/lance/src/dataset/mem_wal/write.rs +++ b/rust/lance/src/dataset/mem_wal/write.rs @@ -47,6 +47,7 @@ pub use super::util::{WatchableOnceCell, WatchableOnceCellReader}; pub use super::wal::{WalEntry, WalEntryData, WalFlushResult, WalFlusher}; use super::memtable::flush::TriggerMemTableFlush; +use super::scanner::GenerationWarmer; use super::wal::{ TriggerWalFlush, WalAppender, WalFlushSource, WalOnlyState, WalTailer, empty_flush_result, }; @@ -231,6 +232,11 @@ pub struct ShardWriterConfig { /// /// Default: empty. pub hnsw_params: HashMap, + + /// Optional warmer fired pre-commit for each new generation (zero cold reads + /// on first query). Wired to the flusher; supplied by the consumer (e.g. the + /// WAL pod). Default: `None`. 
+ pub warmer: Option>, } impl Default for ShardWriterConfig { @@ -254,6 +260,7 @@ impl Default for ShardWriterConfig { frozen_memtable_grace: Duration::ZERO, enable_memtable: true, hnsw_params: HashMap::new(), + warmer: None, } } } @@ -1414,13 +1421,10 @@ impl ShardWriter { let (memtable_flush_tx, memtable_flush_rx) = mpsc::unbounded_channel(); - let flusher = Arc::new(MemTableFlusher::new( - object_store, - base_path, - base_uri, - shard_id, - manifest_store, - )); + let flusher = Arc::new( + MemTableFlusher::new(object_store, base_path, base_uri, shard_id, manifest_store) + .with_warmer(config.warmer.clone()), + ); let backpressure = BackpressureController::new(config.clone()); From 5fd0659a49e4d080290438da7acd80f4e6b0f1d8 Mon Sep 17 00:00:00 2001 From: Lance Release Bot Date: Thu, 18 Jun 2026 16:06:12 +0000 Subject: [PATCH 144/177] chore: release beta version 8.0.0-beta.19 --- .bumpversion.toml | 2 +- Cargo.lock | 52 ++++++++++++++++++------------------- Cargo.toml | 44 +++++++++++++++---------------- java/lance-jni/Cargo.lock | 54 +++++++++++++++++++++++---------------- java/lance-jni/Cargo.toml | 2 +- java/pom.xml | 2 +- python/Cargo.lock | 44 +++++++++++++++---------------- python/Cargo.toml | 2 +- 8 files changed, 106 insertions(+), 96 deletions(-) diff --git a/.bumpversion.toml b/.bumpversion.toml index aa1657223fc..fc6364e2e1e 100644 --- a/.bumpversion.toml +++ b/.bumpversion.toml @@ -1,5 +1,5 @@ [tool.bumpversion] -current_version = "8.0.0-beta.18" +current_version = "8.0.0-beta.19" parse = "(?P\\d+)\\.(?P\\d+)\\.(?P\\d+)(-(?P(beta|rc))\\.(?P\\d+))?" 
serialize = [ "{major}.{minor}.{patch}-{prerelease}.{prerelease_num}", diff --git a/Cargo.lock b/Cargo.lock index 165548bb4a0..d08341ddfe9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1248,9 +1248,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.11.1" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" +checksum = "8ae3f5d315924270530207e2a68396c3cc547f6dca3fbdca317cfb1a51edb593" [[package]] name = "bytes-utils" @@ -3076,7 +3076,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4380,7 +4380,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "all_asserts", "approx", @@ -4483,7 +4483,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow-array", "arrow-buffer", @@ -4531,7 +4531,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrayref", "paste", @@ -4540,7 +4540,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow-array", "arrow-buffer", @@ -4580,7 +4580,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow", "arrow-array", @@ -4613,7 +4613,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow", "arrow-array", @@ -4632,7 +4632,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = 
"8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "proc-macro2", "quote", @@ -4641,7 +4641,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow-arith", "arrow-array", @@ -4686,7 +4686,7 @@ dependencies = [ [[package]] name = "lance-examples" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "all_asserts", "arrow", @@ -4712,7 +4712,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow-arith", "arrow-array", @@ -4751,7 +4751,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "datafusion", "geo-traits", @@ -4765,7 +4765,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "approx", "arc-swap", @@ -4842,7 +4842,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow", "arrow-arith", @@ -4890,7 +4890,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "approx", "arrow-array", @@ -4909,7 +4909,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow", "async-trait", @@ -4921,7 +4921,7 @@ dependencies = [ [[package]] name = "lance-namespace-datafusion" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow-array", "arrow-schema", @@ -4937,7 +4937,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow", "arrow-array", @@ -5001,7 +5001,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow-array", "arrow-buffer", @@ -5019,7 +5019,7 
@@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow", "arrow-array", @@ -5065,7 +5065,7 @@ dependencies = [ [[package]] name = "lance-test-macros" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "proc-macro2", "quote", @@ -5074,7 +5074,7 @@ dependencies = [ [[package]] name = "lance-testing" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow-array", "arrow-schema", @@ -5087,7 +5087,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "icu_segmenter", "jieba-rs", @@ -5100,7 +5100,7 @@ dependencies = [ [[package]] name = "lance-tools" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "clap", "lance-core", diff --git a/Cargo.toml b/Cargo.toml index e07965db278..39a5bc597bd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ resolver = "3" [workspace.package] -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" @@ -57,27 +57,27 @@ rust-version = "1.91.0" [workspace.dependencies] arc-swap = "1.7" libc = "0.2.176" -lance = { version = "=8.0.0-beta.18", path = "./rust/lance", default-features = false } -lance-arrow = { version = "=8.0.0-beta.18", path = "./rust/lance-arrow" } -lance-core = { version = "=8.0.0-beta.18", path = "./rust/lance-core" } -lance-datafusion = { version = "=8.0.0-beta.18", path = "./rust/lance-datafusion" } -lance-datagen = { version = "=8.0.0-beta.18", path = "./rust/lance-datagen" } -lance-derive = { version = "=8.0.0-beta.18", path = "./rust/lance-derive" } -lance-encoding = { version = "=8.0.0-beta.18", path = "./rust/lance-encoding" } -lance-file = { version = "=8.0.0-beta.18", path = "./rust/lance-file" } -lance-geo = { version = "=8.0.0-beta.18", path = "./rust/lance-geo" } -lance-index = { version = "=8.0.0-beta.18", path = "./rust/lance-index" 
} -lance-io = { version = "=8.0.0-beta.18", path = "./rust/lance-io", default-features = false } -lance-linalg = { version = "=8.0.0-beta.18", path = "./rust/lance-linalg" } -lance-namespace = { version = "=8.0.0-beta.18", path = "./rust/lance-namespace" } -lance-namespace-impls = { version = "=8.0.0-beta.18", path = "./rust/lance-namespace-impls" } +lance = { version = "=8.0.0-beta.19", path = "./rust/lance", default-features = false } +lance-arrow = { version = "=8.0.0-beta.19", path = "./rust/lance-arrow" } +lance-core = { version = "=8.0.0-beta.19", path = "./rust/lance-core" } +lance-datafusion = { version = "=8.0.0-beta.19", path = "./rust/lance-datafusion" } +lance-datagen = { version = "=8.0.0-beta.19", path = "./rust/lance-datagen" } +lance-derive = { version = "=8.0.0-beta.19", path = "./rust/lance-derive" } +lance-encoding = { version = "=8.0.0-beta.19", path = "./rust/lance-encoding" } +lance-file = { version = "=8.0.0-beta.19", path = "./rust/lance-file" } +lance-geo = { version = "=8.0.0-beta.19", path = "./rust/lance-geo" } +lance-index = { version = "=8.0.0-beta.19", path = "./rust/lance-index" } +lance-io = { version = "=8.0.0-beta.19", path = "./rust/lance-io", default-features = false } +lance-linalg = { version = "=8.0.0-beta.19", path = "./rust/lance-linalg" } +lance-namespace = { version = "=8.0.0-beta.19", path = "./rust/lance-namespace" } +lance-namespace-impls = { version = "=8.0.0-beta.19", path = "./rust/lance-namespace-impls" } lance-namespace-datafusion = { version = "=7.0.0-beta.9", path = "./rust/lance-namespace-datafusion" } lance-namespace-reqwest-client = "0.8.6" -lance-select = { version = "=8.0.0-beta.18", path = "./rust/lance-select" } -lance-tokenizer = { version = "=8.0.0-beta.18", path = "./rust/lance-tokenizer" } -lance-table = { version = "=8.0.0-beta.18", path = "./rust/lance-table" } -lance-test-macros = { version = "=8.0.0-beta.18", path = "./rust/lance-test-macros" } -lance-testing = { version = "=8.0.0-beta.18", path = 
"./rust/lance-testing" } +lance-select = { version = "=8.0.0-beta.19", path = "./rust/lance-select" } +lance-tokenizer = { version = "=8.0.0-beta.19", path = "./rust/lance-tokenizer" } +lance-table = { version = "=8.0.0-beta.19", path = "./rust/lance-table" } +lance-test-macros = { version = "=8.0.0-beta.19", path = "./rust/lance-test-macros" } +lance-testing = { version = "=8.0.0-beta.19", path = "./rust/lance-testing" } approx = "0.5.1" # Note that this one does not include pyarrow arrow = { version = "58.0.0", optional = false, features = ["prettyprint"] } @@ -104,7 +104,7 @@ half = { "version" = "2.1", default-features = false, features = [ "num-traits", "std", ] } -lance-bitpacking = { version = "=8.0.0-beta.18", path = "./rust/compression/bitpacking" } +lance-bitpacking = { version = "=8.0.0-beta.19", path = "./rust/compression/bitpacking" } bitpacking = "0.9" bitvec = "1" bytes = "1.11.1" @@ -143,7 +143,7 @@ datafusion-substrait = { version = "53.0.0", default-features = false } dirs = "6.0.0" either = "1.0" fst = { version = "0.4.7", features = ["levenshtein"] } -fsst = { version = "=8.0.0-beta.18", path = "./rust/compression/fsst" } +fsst = { version = "=8.0.0-beta.19", path = "./rust/compression/fsst" } futures = "0.3" geoarrow-array = "0.8" geoarrow-schema = "0.8" diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index 01ee724f59e..1a5978d5782 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -1029,9 +1029,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.11.1" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" +checksum = "8ae3f5d315924270530207e2a68396c3cc547f6dca3fbdca317cfb1a51edb593" [[package]] name = "bytes-utils" @@ -2479,7 +2479,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] 
name = "fsst" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow-array", "rand 0.9.4", @@ -3665,7 +3665,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arc-swap", "arrow", @@ -3738,7 +3738,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow-array", "arrow-buffer", @@ -3780,7 +3780,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrayref", "paste", @@ -3789,7 +3789,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow-array", "arrow-buffer", @@ -3827,7 +3827,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow", "arrow-array", @@ -3859,7 +3859,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow", "arrow-array", @@ -3876,7 +3876,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "proc-macro2", "quote", @@ -3885,7 +3885,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow-arith", "arrow-array", @@ -3920,7 +3920,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow-arith", "arrow-array", @@ -3950,7 +3950,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "datafusion", "geo-traits", @@ -3964,7 +3964,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.18" +version = 
"8.0.0-beta.19" dependencies = [ "arc-swap", "arrow", @@ -4032,7 +4032,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow", "arrow-arith", @@ -4073,7 +4073,7 @@ dependencies = [ [[package]] name = "lance-jni" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow", "arrow-array", @@ -4109,7 +4109,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow-array", "arrow-buffer", @@ -4124,7 +4124,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow", "async-trait", @@ -4136,7 +4136,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow", "arrow-ipc", @@ -4185,7 +4185,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow-array", "arrow-buffer", @@ -4200,7 +4200,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow", "arrow-array", @@ -4237,11 +4237,12 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "icu_segmenter", "rust-stemmers", "serde", + "stop-words", "unicode-normalization", ] @@ -6767,6 +6768,15 @@ version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e51f1e89f093f99e7432c491c382b88a6860a5adbe6bf02574bf0a08efff1978" +[[package]] +name = "stop-words" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d68df56303396bcfb639455b3c166804aeb7994005010aab5e9e8a1277b8871d" +dependencies = [ + "serde_json", +] + [[package]] name = "strsim" version = "0.11.1" diff --git a/java/lance-jni/Cargo.toml 
b/java/lance-jni/Cargo.toml index b17df027736..4df708de9d7 100644 --- a/java/lance-jni/Cargo.toml +++ b/java/lance-jni/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lance-jni" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" edition = "2024" authors = ["Lance Devs "] rust-version = "1.91" diff --git a/java/pom.xml b/java/pom.xml index bfd82436e81..33b4d45bf01 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -7,7 +7,7 @@ org.lance lance-core Lance Core - 8.0.0-beta.18 + 8.0.0-beta.19 jar Lance Format Java API diff --git a/python/Cargo.lock b/python/Cargo.lock index c3319f5cba6..6acf34203ab 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -1209,9 +1209,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.11.1" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" +checksum = "8ae3f5d315924270530207e2a68396c3cc547f6dca3fbdca317cfb1a51edb593" [[package]] name = "bytes-utils" @@ -2859,7 +2859,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4067,7 +4067,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arc-swap", "arrow", @@ -4141,7 +4141,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow-array", "arrow-buffer", @@ -4183,7 +4183,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrayref", "paste", @@ -4192,7 +4192,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" 
dependencies = [ "arrow-array", "arrow-buffer", @@ -4230,7 +4230,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow", "arrow-array", @@ -4262,7 +4262,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow", "arrow-array", @@ -4279,7 +4279,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "proc-macro2", "quote", @@ -4288,7 +4288,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow-arith", "arrow-array", @@ -4323,7 +4323,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow-arith", "arrow-array", @@ -4353,7 +4353,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "datafusion", "geo-traits", @@ -4367,7 +4367,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arc-swap", "arrow", @@ -4436,7 +4436,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow", "arrow-arith", @@ -4477,7 +4477,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow-array", "arrow-buffer", @@ -4492,7 +4492,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow", "async-trait", @@ -4504,7 +4504,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow", "arrow-ipc", @@ -4553,7 +4553,7 @@ dependencies = [ [[package]] name = "lance-select" 
-version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow-array", "arrow-buffer", @@ -4568,7 +4568,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "arrow", "arrow-array", @@ -4607,7 +4607,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "icu_segmenter", "jieba-rs", @@ -6045,7 +6045,7 @@ dependencies = [ [[package]] name = "pylance" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" dependencies = [ "alloc-stdlib", "arrow", diff --git a/python/Cargo.toml b/python/Cargo.toml index d43bbcf2001..12b14cd82a2 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pylance" -version = "8.0.0-beta.18" +version = "8.0.0-beta.19" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" From 28781899bbaeacc2eec4e231368ad4718d0836ef Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Fri, 19 Jun 2026 00:37:31 +0800 Subject: [PATCH 145/177] feat: allow tuning miniblock value chunks to 32k (#7356) This allows miniblock writers to use up to 32K logical values per chunk when explicitly configured via `LANCE_MINIBLOCK_MAX_VALUES`, while keeping the default at 4096. The file format already stores `log_num_values` in 4 bits, so the writer-side guard can allow values up to 15 without requiring the large-chunk metadata path. The compressed byte-size limits remain enforced. Fixes #7326. --- docs/src/format/file/encoding.md | 7 +- .../src/encodings/logical/primitive.rs | 81 ++++++++++++++----- .../encodings/logical/primitive/miniblock.rs | 26 ++++-- 3 files changed, 86 insertions(+), 28 deletions(-) diff --git a/docs/src/format/file/encoding.md b/docs/src/format/file/encoding.md index a3d99ef39cb..4ca053d4fa6 100644 --- a/docs/src/format/file/encoding.md +++ b/docs/src/format/file/encoding.md @@ -683,9 +683,10 @@ the default mini-block size is negligible. 
You should only consider changing thi confirmed — through profiling — that mini-block read amplification is saturating your available bandwidth (for example, accessing a remote object store over a constrained network link). -The maximum number of values per mini-block can be lowered via an environment variable: +The maximum number of values per mini-block can be tuned via an environment variable: -- `LANCE_MINIBLOCK_MAX_VALUES` (default `4096`): upper bound on the number of values in a single mini-block chunk. +- `LANCE_MINIBLOCK_MAX_VALUES` (default `4096`, maximum `32768`): upper bound on the number of values in a single mini-block chunk. Reducing this value produces smaller mini-blocks, which reduces the amount of data fetched per read at the -cost of more mini-blocks and slightly more metadata overhead. +cost of more mini-blocks and slightly more metadata overhead. Increasing it can reduce metadata overhead and +improve throughput for highly compressible data, but it may increase random-read amplification. diff --git a/rust/lance-encoding/src/encodings/logical/primitive.rs b/rust/lance-encoding/src/encodings/logical/primitive.rs index 064e3b59745..9b506359e55 100644 --- a/rust/lance-encoding/src/encodings/logical/primitive.rs +++ b/rust/lance-encoding/src/encodings/logical/primitive.rs @@ -3701,12 +3701,7 @@ struct SerializedFullZip { // // If we directly record the size in bytes with 12 bits we would be limited to // 4KiB which is too small. Since we know each mini-block consists of 8 byte -// words we can store the # of words instead which gives us 32KiB. We want -// at least 24KiB so we can handle even the worst case of -// - 4Ki values compressed into an 8186 byte buffer -// - 4 bytes to describe rep & def lengths -// - 16KiB of rep & def buffer (this will almost never happen but life is easier if we -// plan for it) +// words we can store the # of words instead which gives us 32KiB. // // Second, each chunk in a mini-block is aligned to 8 bytes. 
This allows multi-byte // values like offsets to be stored in a mini-block and safely read back out. It also @@ -3906,9 +3901,9 @@ impl PrimitiveStructuralEncoder { // 0xA) All blocks except the last must have power-of-two number of values. // This not only makes metadata smaller but it makes decoding easier since // batch sizes are typically a power of 2. 4 bits would allow us to express - // up to 16Ki values but we restrict this further to 4Ki values. + // up to 32Ki values. // - // This means blocks can have 1 to 4Ki values and 8 - 32Ki bytes. + // This means blocks can have 1 to 32Ki values and 8 - 32Ki bytes. // // All metadata words are serialized (as little endian) into a single buffer // of metadata values. @@ -4007,7 +4002,13 @@ impl PrimitiveStructuralEncoder { } } else { for &buffer_size in &chunk.buffer_sizes { - data_buffer.extend_from_slice(&(buffer_size as u16).to_le_bytes()); + let buffer_size = u16::try_from(buffer_size).map_err(|_| { + Error::internal(format!( + "Mini-block buffer size ({} bytes) too large for 16-bit metadata", + buffer_size + )) + })?; + data_buffer.extend_from_slice(&buffer_size.to_le_bytes()); } } @@ -4041,15 +4042,28 @@ impl PrimitiveStructuralEncoder { let chunk_bytes = data_buffer.len() - start_pos; let max_chunk_size = if support_large_chunk { - 4 * 1024 * 1024 * 1024 // 4GB limit with u32 metadata + 1_u64 << 31 // 28 bits of 8-byte words in u32 metadata } else { 32 * 1024 // 32KiB limit with u16 metadata }; - assert!(chunk_bytes <= max_chunk_size); - assert!(chunk_bytes > 0); - assert_eq!(chunk_bytes % 8, 0); - // 4Ki values max - assert!(chunk.log_num_values <= 12); + if chunk_bytes == 0 || chunk_bytes as u64 > max_chunk_size { + return Err(Error::internal(format!( + "Mini-block chunk size {} bytes exceeds the {} byte metadata limit", + chunk_bytes, max_chunk_size + ))); + } + if chunk_bytes % MINIBLOCK_ALIGNMENT != 0 { + return Err(Error::internal(format!( + "Mini-block chunk size {} bytes is not aligned to {} bytes", + 
chunk_bytes, MINIBLOCK_ALIGNMENT + ))); + } + if chunk.log_num_values > 15 { + return Err(Error::internal(format!( + "Mini-block log_num_values {} exceeds the 4-bit metadata limit", + chunk.log_num_values + ))); + } // We subtract 1 here from chunk_bytes because we want to be able to express // a size of 32KiB and not (32Ki - 8)B which is what we'd get otherwise with // 0xFFF @@ -5768,8 +5782,9 @@ mod tests { use super::{ ChunkInstructions, DataBlock, DecodeMiniBlockTask, FixedPerValueDecompressor, FixedWidthDataBlock, FullZipCacheableState, FullZipDecodeDetails, FullZipReadSource, - FullZipRepIndexDetails, FullZipScheduler, MiniBlockRepIndex, PerValueDecompressor, - PreambleAction, StructuralPageScheduler, VariableFullZipDecoder, + FullZipRepIndexDetails, FullZipScheduler, MiniBlockChunk, MiniBlockCompressed, + MiniBlockRepIndex, PerValueDecompressor, PreambleAction, StructuralPageScheduler, + VariableFullZipDecoder, }; use crate::buffer::LanceBuffer; use crate::compression::DefaultDecompressionStrategy; @@ -6967,7 +6982,7 @@ mod tests { #[tokio::test] async fn test_binary_large_minichunk_size_over_max_miniblock_values() { let mut string_data = Vec::new(); - // 128kb/chunk / 6 bytes (t_9999) = 21845 > max 4096 items per chunk + // 128kb/chunk / 6 bytes (t_9999) = 21845 items per chunk for i in 0..10000 { string_data.push(Some(format!("t_{}", i))); } @@ -7566,6 +7581,36 @@ mod tests { ); } + #[test] + fn test_v2_1_miniblock_serializes_log_num_values_15() { + let miniblocks = MiniBlockCompressed { + data: vec![LanceBuffer::from(vec![1_u8; 16])], + chunks: vec![ + MiniBlockChunk { + buffer_sizes: vec![8], + log_num_values: 15, + }, + MiniBlockChunk { + buffer_sizes: vec![8], + log_num_values: 0, + }, + ], + num_values: 32_769, + }; + + let serialized = + PrimitiveStructuralEncoder::serialize_miniblocks(miniblocks, None, None, false) + .unwrap(); + + let chunk_metadata = serialized.metadata.borrow_to_typed_slice::(); + assert_eq!(chunk_metadata.len(), 2); + 
assert_eq!( + chunk_metadata[0] & 0x0F, + 15, + "V2.1 metadata should use all 4 bits for log_num_values" + ); + } + async fn encode_first_page( field: arrow_schema::Field, array: ArrayRef, diff --git a/rust/lance-encoding/src/encodings/logical/primitive/miniblock.rs b/rust/lance-encoding/src/encodings/logical/primitive/miniblock.rs index de3227b2a39..1cf3b9bf581 100644 --- a/rust/lance-encoding/src/encodings/logical/primitive/miniblock.rs +++ b/rust/lance-encoding/src/encodings/logical/primitive/miniblock.rs @@ -19,13 +19,14 @@ use lance_core::Result; pub const MAX_MINIBLOCK_BYTES: u64 = 8 * 1024 - 6; const DEFAULT_MAX_MINIBLOCK_VALUES: u64 = 4096; +const MAX_CONFIGURABLE_MINIBLOCK_VALUES: u64 = 32768; fn parse_max_miniblock_values() -> u64 { let val = std::env::var("LANCE_MINIBLOCK_MAX_VALUES") .ok() .and_then(|v| v.parse().ok()) .unwrap_or(DEFAULT_MAX_MINIBLOCK_VALUES); - val.clamp(1, DEFAULT_MAX_MINIBLOCK_VALUES) + val.clamp(1, MAX_CONFIGURABLE_MINIBLOCK_VALUES) } pub static MAX_MINIBLOCK_VALUES: std::sync::LazyLock = @@ -58,9 +59,9 @@ pub struct MiniBlockCompressed { /// and contain a power-of-two number of values (except for the last chunk) /// /// By default we limit a chunk to 4Ki values and slightly less than -/// 8KiB of compressed data. This means that even in the extreme case -/// where we have 4 bytes of rep/def then we will have at most 24KiB of -/// data (values, repetition, and definition) per mini-block. +/// 8KiB of compressed value data. The byte budget remains the primary +/// constraint, so only encodings that compress many values into that +/// budget can use larger value counts when explicitly configured. /// /// The maximum number of values per chunk can be configured via the /// `LANCE_MINIBLOCK_MAX_VALUES` environment variable. 
This is only @@ -77,8 +78,8 @@ pub struct MiniBlockChunk { // then this should be 0 (the number of values will be calculated by subtracting the // size of all other chunks from the total size of the page) // - // For example, 1 would mean there are 2 values in the chunk and 12 would mean there - // are 4Ki values in the chunk. + // For example, 1 would mean there are 2 values in the chunk and 15 would mean there + // are 32Ki values in the chunk. // // This must be <= log2(MAX_MINIBLOCK_VALUES) (i.e. <= 12 at the default of 4096) pub log_num_values: u8, @@ -135,6 +136,14 @@ mod tests { unsafe { std::env::remove_var("LANCE_MINIBLOCK_MAX_VALUES") }; } + #[test] + #[serial] + fn test_parse_can_raise_to_32k() { + unsafe { std::env::set_var("LANCE_MINIBLOCK_MAX_VALUES", "32768") }; + assert_eq!(parse_max_miniblock_values(), 32768); + unsafe { std::env::remove_var("LANCE_MINIBLOCK_MAX_VALUES") }; + } + #[test] #[serial] fn test_parse_clamps_zero_to_one() { @@ -147,7 +156,10 @@ mod tests { #[serial] fn test_parse_clamps_above_max() { unsafe { std::env::set_var("LANCE_MINIBLOCK_MAX_VALUES", "99999") }; - assert_eq!(parse_max_miniblock_values(), DEFAULT_MAX_MINIBLOCK_VALUES); + assert_eq!( + parse_max_miniblock_values(), + MAX_CONFIGURABLE_MINIBLOCK_VALUES + ); unsafe { std::env::remove_var("LANCE_MINIBLOCK_MAX_VALUES") }; } From b2c26ea295a1bdcc03636ff78e7ef8b253b5ac6e Mon Sep 17 00:00:00 2001 From: Yang Cen Date: Fri, 19 Jun 2026 00:54:54 +0800 Subject: [PATCH 146/177] perf(fts): defer term frequency collection (#7357) ## Performance Improvement ### What is the performance issue or bottleneck? WAND search eagerly collected term frequencies for each scored candidate before checking whether that candidate would enter the top-k heap. Candidates rejected by the current kth score still paid for a `Vec` allocation and `PostingIterator::doc()` calls. ### How does this PR improve performance? 
This moves term-frequency collection into the two branches that actually insert or replace a top-k candidate, for both WAND search and flat search. Rejected candidates now avoid collecting frequencies entirely. The PR also adds focused regression tests that count test-only term-frequency collection calls and verify rejected equal-score candidates do not collect frequencies. ### Benchmark `wikipedia-40m-fts`, V2 `text_idx`, `match`, `query_length=5`, `stop_words=0`, `k=100`, `num_queries=1000`, `seed=0`. | build | QPS | avg | p50 | p90 | p99 | |---|---:|---:|---:|---:|---:| | before `838f78b9` | 127.46 | 62.56ms | 57.20ms | 113.27ms | 223.31ms | | after `4c05e1c` | 130.34 | 61.17ms | 54.65ms | 112.63ms | 219.22ms | | delta | +2.26% | -2.22% | -4.46% | -0.56% | -1.83% | Raw artifacts: - suite: `/mnt/benchmark-ssd/search-benchmark/bench_suites/wikipedia_fts_wand_freqs_compare_k100_20260618.json` - log: `/mnt/benchmark-ssd/logs/wikipedia_fts_wand_freqs_compare_k100_v2_20260618_091950.log` - csv: `/mnt/benchmark-ssd/search-benchmark/results/results_fts_static_20260618_093407.csv` ### Validation - `cargo fmt --all` - `CARGO_TARGET_DIR=/tmp/lance-target-21fd-pr-clippy cargo clippy --all --tests --benches -- -D warnings` - `CARGO_TARGET_DIR=/tmp/lance-target-21fd-pr-clippy cargo test -p lance-index scalar::inverted::wand::tests::` - `git diff --check` - `/Users/yang/.cache/uv/archive-v0/CK_YxmMYMk7DlRLAQr3It/bin/python -mpre_commit run --files rust/lance-index/src/scalar/inverted/wand.rs -v` --- rust/lance-index/src/scalar/inverted/wand.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/rust/lance-index/src/scalar/inverted/wand.rs b/rust/lance-index/src/scalar/inverted/wand.rs index dda0e69e7f8..212038ba432 100644 --- a/rust/lance-index/src/scalar/inverted/wand.rs +++ b/rust/lance-index/src/scalar/inverted/wand.rs @@ -776,14 +776,15 @@ impl<'a, S: Scorer> Wand<'a, S> { self.score(doc_length) }; - let freqs = self.iter_term_freqs().collect(); if 
candidates.len() < limit { + let freqs = self.iter_term_freqs().collect(); candidates.push(Reverse((ScoredDoc::new(row_id, score), freqs, doc_length))); if candidates.len() == limit { let kth = candidates.peek().unwrap().0.0.score.0; self.update_threshold(kth, params.wand_factor); } } else if score > candidates.peek().unwrap().0.0.score.0 { + let freqs = self.iter_term_freqs().collect(); candidates.pop(); candidates.push(Reverse((ScoredDoc::new(row_id, score), freqs, doc_length))); let kth = candidates.peek().unwrap().0.0.score.0; @@ -894,15 +895,16 @@ impl<'a, S: Scorer> Wand<'a, S> { self.collect_tail_matches(doc_id); let score = self.score(doc_length); - let freqs = self.iter_term_freqs().collect(); if candidates.len() < limit { + let freqs = self.iter_term_freqs().collect(); candidates.push(Reverse((ScoredDoc::new(row_id, score), freqs, doc_length))); if candidates.len() == limit { let kth = candidates.peek().unwrap().0.0.score.0; self.update_threshold(kth, params.wand_factor); } } else if score > candidates.peek().unwrap().0.0.score.0 { + let freqs = self.iter_term_freqs().collect(); candidates.pop(); candidates.push(Reverse((ScoredDoc::new(row_id, score), freqs, doc_length))); let kth = candidates.peek().unwrap().0.0.score.0; From cdcb18afee6e800969a99e0527f138492f66afb4 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Thu, 18 Jun 2026 10:23:15 -0700 Subject: [PATCH 147/177] fix: stream FM index partition builds (#7354) Streams FM index training and update input into partition-sized writes instead of materializing the full training stream first. This keeps large segmented FM builds bounded by one partition plus the current Arrow batch while preserving the existing partition file layout. 
--- rust/lance-index/src/scalar/fmindex.rs | 236 +++++++++++++++++++++---- 1 file changed, 197 insertions(+), 39 deletions(-) diff --git a/rust/lance-index/src/scalar/fmindex.rs b/rust/lance-index/src/scalar/fmindex.rs index aed1136535a..cdf19f0304c 100644 --- a/rust/lance-index/src/scalar/fmindex.rs +++ b/rust/lance-index/src/scalar/fmindex.rs @@ -1376,8 +1376,7 @@ impl ScalarIndex for FMIndexScalarIndex { dest: &dyn IndexStore, _old_data_filter: Option, ) -> Result { - let texts = collect_texts(new_data).await?; - let files = write_partitioned_fmindex(&texts, dest).await?; + let files = write_partitioned_fmindex_stream(new_data, dest).await?; Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&pb::FmIndexIndexDetails {}).unwrap(), index_version: FMINDEX_INDEX_VERSION, @@ -1396,8 +1395,14 @@ impl ScalarIndex for FMIndexScalarIndex { // ── Helpers ────────────────────────────────────────────────────────────────── -async fn collect_texts(mut stream: SendableRecordBatchStream) -> Result)>> { - let mut texts = Vec::new(); +async fn write_partitioned_fmindex_stream( + mut stream: SendableRecordBatchStream, + store: &dyn IndexStore, +) -> Result> { + let mut files = Vec::new(); + let mut partition = Vec::with_capacity(PARTITION_SIZE); + let mut partition_id = 0; + while let Some(batch) = stream.next().await { let batch = batch?; // Prefer _rowaddr (global row address) over _rowid to ensure stable, @@ -1415,24 +1420,85 @@ async fn collect_texts(mut stream: SendableRecordBatchStream) -> Result = bytes - .iter() - .map(|&b| { - if b == SENTINEL_BYTE || b == 0x00 { - b' ' - } else { - b - } - }) - .collect(); - texts.push((rid, sanitized)); + if let Some(bytes) = extract_sanitized_text_bytes(value_col.as_ref(), i)? 
{ + partition.push((rid, bytes)); + if partition.len() == PARTITION_SIZE { + files.push(write_fmindex_partition(&partition, store, partition_id).await?); + partition.clear(); + partition_id += 1; + } } } } - Ok(texts) + + if !partition.is_empty() { + files.push(write_fmindex_partition(&partition, store, partition_id).await?); + } else if files.is_empty() { + files.push(write_empty_fmindex_partition(store).await?); + } + + Ok(files) +} + +fn sanitize_text_bytes(bytes: &[u8]) -> Vec { + bytes + .iter() + .map(|&b| { + if b == SENTINEL_BYTE || b == 0x00 { + b' ' + } else { + b + } + }) + .collect() } +fn extract_sanitized_text_bytes( + array: &dyn arrow_array::Array, + index: usize, +) -> Result>> { + if array.is_null(index) { + return Ok(None); + } + match array.data_type() { + DataType::Utf8 => Ok(Some(sanitize_text_bytes( + array + .as_any() + .downcast_ref::() + .unwrap() + .value(index) + .as_bytes(), + ))), + DataType::LargeUtf8 => Ok(Some(sanitize_text_bytes( + array + .as_any() + .downcast_ref::() + .unwrap() + .value(index) + .as_bytes(), + ))), + DataType::Binary => Ok(Some(sanitize_text_bytes( + array + .as_any() + .downcast_ref::() + .unwrap() + .value(index), + ))), + DataType::LargeBinary => Ok(Some(sanitize_text_bytes( + array + .as_any() + .downcast_ref::() + .unwrap() + .value(index), + ))), + _ => Err(Error::invalid_input(format!( + "Fm does not support data type: {:?}", + array.data_type() + ))), + } +} + +#[cfg(test)] fn extract_text_bytes(array: &dyn arrow_array::Array, index: usize) -> Result>> { if array.is_null(index) { return Ok(None); @@ -1574,25 +1640,36 @@ async fn write_fmindex(fm: &FMIndex, store: &dyn IndexStore, filename: &str) -> writer.finish_with_metadata(metadata).await } +#[cfg(test)] async fn write_partitioned_fmindex( texts: &[(u64, Vec)], store: &dyn IndexStore, ) -> Result> { - let refs: Vec<(u64, &[u8])> = texts.iter().map(|(id, t)| (*id, t.as_slice())).collect(); - if refs.is_empty() { - let fm = FMIndex::build(&[])?; - 
return Ok(vec![ - write_fmindex(&fm, store, &fmindex_partition_path(0)).await?, - ]); + if texts.is_empty() { + return Ok(vec![write_empty_fmindex_partition(store).await?]); } let mut files = Vec::new(); - for (pid, chunk) in refs.chunks(PARTITION_SIZE).enumerate() { - let fm = FMIndex::build(chunk)?; - files.push(write_fmindex(&fm, store, &fmindex_partition_path(pid as u64)).await?); + for (pid, chunk) in texts.chunks(PARTITION_SIZE).enumerate() { + files.push(write_fmindex_partition(chunk, store, pid as u64).await?); } Ok(files) } +async fn write_fmindex_partition( + texts: &[(u64, Vec)], + store: &dyn IndexStore, + partition_id: u64, +) -> Result { + let refs: Vec<(u64, &[u8])> = texts.iter().map(|(id, t)| (*id, t.as_slice())).collect(); + let fm = FMIndex::build(&refs)?; + write_fmindex(&fm, store, &fmindex_partition_path(partition_id)).await +} + +async fn write_empty_fmindex_partition(store: &dyn IndexStore) -> Result { + let fm = FMIndex::build(&[])?; + write_fmindex(&fm, store, &fmindex_partition_path(0)).await +} + // ── Plugin ─────────────────────────────────────────────────────────────────── #[derive(Debug, Default)] @@ -1629,8 +1706,7 @@ impl ScalarIndexPlugin for FMIndexPlugin { _fids: Option>, _progress: Arc, ) -> Result { - let texts = collect_texts(data).await?; - let files = write_partitioned_fmindex(&texts, store).await?; + let files = write_partitioned_fmindex_stream(data, store).await?; Ok(CreatedIndex { index_details: prost_types::Any::from_msg(&pb::FmIndexIndexDetails {}).unwrap(), index_version: FMINDEX_INDEX_VERSION, @@ -1681,7 +1757,10 @@ impl ScalarIndexPlugin for FMIndexPlugin { #[cfg(test)] mod tests { use super::*; - use lance_core::cache::LanceCache; + use arrow_array::{BinaryArray, LargeBinaryArray, LargeStringArray, StringArray, UInt64Array}; + use datafusion::physical_plan::stream::RecordBatchStreamAdapter; + use futures::stream; + use lance_core::{ROW_ADDR, cache::LanceCache}; use lance_io::object_store::ObjectStore; use 
object_store::path::Path; use std::sync::Arc; @@ -1894,11 +1973,10 @@ mod tests { #[test] fn test_sentinel_sanitization() { - // Text containing \xFF should be sanitized to space + // Text containing \xFF should be sanitized to space during training. let texts: Vec<(u64, &[u8])> = vec![(0, b"hello\xFFworld")]; let fm = FMIndex::build(&texts).unwrap(); - // The \xFF is replaced with space during collect_texts, but here we test build directly - // which doesn't sanitize. The search should still work. + // Build itself does not sanitize, but search should still work. let r = fm.search(b"hello"); assert!(r.contains(0)); } @@ -2070,11 +2148,6 @@ mod tests { #[tokio::test(flavor = "multi_thread")] async fn test_plugin_train_and_load() { - use arrow_array::{StringArray, UInt64Array}; - use datafusion::physical_plan::stream::RecordBatchStreamAdapter; - use futures::stream; - use lance_core::ROW_ADDR; - let docs = vec!["hello world", "hello rust", "goodbye world"]; let row_addrs: Vec = vec![0, 1, 2]; let schema = Arc::new(arrow_schema::Schema::new(vec![ @@ -2137,6 +2210,88 @@ mod tests { } } + #[tokio::test(flavor = "multi_thread")] + async fn test_plugin_train_streams_multiple_partitions() { + fn training_batch( + schema: Arc, + start: usize, + len: usize, + ) -> RecordBatch { + let docs = vec!["x"; len]; + let row_addrs: Vec = (start..start + len).map(|i| i as u64).collect(); + RecordBatch::try_new( + schema, + vec![ + Arc::new(StringArray::from(docs)), + Arc::new(UInt64Array::from(row_addrs)), + ], + ) + .unwrap() + } + + let total_rows = PARTITION_SIZE + 5; + let first_batch_rows = PARTITION_SIZE - 3; + let schema = Arc::new(arrow_schema::Schema::new(vec![ + arrow_schema::Field::new( + crate::scalar::registry::VALUE_COLUMN_NAME, + DataType::Utf8, + false, + ), + arrow_schema::Field::new(ROW_ADDR, DataType::UInt64, false), + ])); + let batches = vec![ + Ok(training_batch(schema.clone(), 0, first_batch_rows)), + Ok(training_batch( + schema.clone(), + first_batch_rows, + 
total_rows - first_batch_rows, + )), + ]; + + let tempdir = tempfile::tempdir().unwrap(); + let index_dir = Path::from_filesystem_path(tempdir.path()).unwrap(); + let store = Arc::new(LanceIndexStore::new( + Arc::new(ObjectStore::local()), + index_dir, + Arc::new(LanceCache::no_cache()), + )); + + let stream = RecordBatchStreamAdapter::new(schema, stream::iter(batches)); + let req = FMIndexPlugin + .new_training_request("", &arrow_schema::Field::new("val", DataType::Utf8, false)) + .unwrap(); + let created = FMIndexPlugin + .train_index( + Box::pin(stream), + store.as_ref(), + req, + None, + Arc::new(crate::progress::NoopIndexBuildProgress), + ) + .await + .unwrap(); + + assert_eq!(created.files.len(), 2); + + let index = FMIndexPlugin + .load_index(store, &created.index_details, None, &LanceCache::no_cache()) + .await + .unwrap(); + let r = index + .search( + &TextQuery::StringContains("x".to_string()), + &crate::metrics::NoOpMetricsCollector, + ) + .await + .unwrap(); + match r { + SearchResult::Exact(set) => { + assert_eq!(set.len(), Some(total_rows as u64)); + } + _ => panic!("expected exact result"), + } + } + #[test] fn test_build_wavelet_batch() { let texts: Vec<(u64, &[u8])> = vec![(0, b"hello world"), (1, b"test data")]; @@ -2148,8 +2303,6 @@ mod tests { #[test] fn test_extract_text_bytes_types() { - use arrow_array::{BinaryArray, LargeBinaryArray, LargeStringArray, StringArray}; - let utf8 = StringArray::from(vec!["hello"]); assert_eq!( extract_text_bytes(&utf8, 0).unwrap(), @@ -2167,6 +2320,11 @@ mod tests { extract_text_bytes(&binary, 0).unwrap(), Some(b"bytes".to_vec()) ); + let binary_with_sentinels = BinaryArray::from(vec![b"a\xFFb\0c" as &[u8]]); + assert_eq!( + extract_sanitized_text_bytes(&binary_with_sentinels, 0).unwrap(), + Some(b"a b c".to_vec()) + ); let large_binary = LargeBinaryArray::from(vec![b"large" as &[u8]]); assert_eq!( From 5651ff5c058a61235cae17392afed671af310553 Mon Sep 17 00:00:00 2001 From: Lance Release Bot Date: Thu, 18 Jun 
2026 20:04:02 +0000 Subject: [PATCH 148/177] chore: bump main to 8.1.0-beta.0 Unreleased version after creating v8.0.0-rc.1 --- .bumpversion.toml | 2 +- Cargo.lock | 52 +++++++++++++++++++-------------------- Cargo.toml | 44 ++++++++++++++++----------------- java/lance-jni/Cargo.lock | 44 ++++++++++++++++----------------- java/lance-jni/Cargo.toml | 2 +- java/pom.xml | 2 +- python/Cargo.lock | 44 ++++++++++++++++----------------- python/Cargo.toml | 2 +- 8 files changed, 96 insertions(+), 96 deletions(-) diff --git a/.bumpversion.toml b/.bumpversion.toml index fc6364e2e1e..80668862afb 100644 --- a/.bumpversion.toml +++ b/.bumpversion.toml @@ -1,5 +1,5 @@ [tool.bumpversion] -current_version = "8.0.0-beta.19" +current_version = "8.1.0-beta.0" parse = "(?P\\d+)\\.(?P\\d+)\\.(?P\\d+)(-(?P(beta|rc))\\.(?P\\d+))?" serialize = [ "{major}.{minor}.{patch}-{prerelease}.{prerelease_num}", diff --git a/Cargo.lock b/Cargo.lock index d08341ddfe9..89d20bdf647 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1119,9 +1119,9 @@ dependencies = [ [[package]] name = "bitvec" -version = "1.0.1" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c" +checksum = "ddcec3d12c579d40898fe0a9a358a803c23e9c52ca3c425707f81c9436211837" dependencies = [ "funty", "radium", @@ -3076,7 +3076,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4380,7 +4380,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "all_asserts", "approx", @@ -4483,7 +4483,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -4531,7 
+4531,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrayref", "paste", @@ -4540,7 +4540,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -4580,7 +4580,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-array", @@ -4613,7 +4613,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-array", @@ -4632,7 +4632,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "proc-macro2", "quote", @@ -4641,7 +4641,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow-arith", "arrow-array", @@ -4686,7 +4686,7 @@ dependencies = [ [[package]] name = "lance-examples" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "all_asserts", "arrow", @@ -4712,7 +4712,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow-arith", "arrow-array", @@ -4751,7 +4751,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "datafusion", "geo-traits", @@ -4765,7 +4765,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "approx", "arc-swap", @@ -4842,7 +4842,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-arith", @@ -4890,7 +4890,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "approx", 
"arrow-array", @@ -4909,7 +4909,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow", "async-trait", @@ -4921,7 +4921,7 @@ dependencies = [ [[package]] name = "lance-namespace-datafusion" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "arrow-schema", @@ -4937,7 +4937,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-array", @@ -5001,7 +5001,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -5019,7 +5019,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-array", @@ -5065,7 +5065,7 @@ dependencies = [ [[package]] name = "lance-test-macros" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "proc-macro2", "quote", @@ -5074,7 +5074,7 @@ dependencies = [ [[package]] name = "lance-testing" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "arrow-schema", @@ -5087,7 +5087,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "icu_segmenter", "jieba-rs", @@ -5100,7 +5100,7 @@ dependencies = [ [[package]] name = "lance-tools" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "clap", "lance-core", diff --git a/Cargo.toml b/Cargo.toml index 39a5bc597bd..f902f10496b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ resolver = "3" [workspace.package] -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" @@ -57,27 +57,27 @@ rust-version = "1.91.0" [workspace.dependencies] arc-swap = "1.7" libc = "0.2.176" -lance = { version = "=8.0.0-beta.19", 
path = "./rust/lance", default-features = false } -lance-arrow = { version = "=8.0.0-beta.19", path = "./rust/lance-arrow" } -lance-core = { version = "=8.0.0-beta.19", path = "./rust/lance-core" } -lance-datafusion = { version = "=8.0.0-beta.19", path = "./rust/lance-datafusion" } -lance-datagen = { version = "=8.0.0-beta.19", path = "./rust/lance-datagen" } -lance-derive = { version = "=8.0.0-beta.19", path = "./rust/lance-derive" } -lance-encoding = { version = "=8.0.0-beta.19", path = "./rust/lance-encoding" } -lance-file = { version = "=8.0.0-beta.19", path = "./rust/lance-file" } -lance-geo = { version = "=8.0.0-beta.19", path = "./rust/lance-geo" } -lance-index = { version = "=8.0.0-beta.19", path = "./rust/lance-index" } -lance-io = { version = "=8.0.0-beta.19", path = "./rust/lance-io", default-features = false } -lance-linalg = { version = "=8.0.0-beta.19", path = "./rust/lance-linalg" } -lance-namespace = { version = "=8.0.0-beta.19", path = "./rust/lance-namespace" } -lance-namespace-impls = { version = "=8.0.0-beta.19", path = "./rust/lance-namespace-impls" } +lance = { version = "=8.1.0-beta.0", path = "./rust/lance", default-features = false } +lance-arrow = { version = "=8.1.0-beta.0", path = "./rust/lance-arrow" } +lance-core = { version = "=8.1.0-beta.0", path = "./rust/lance-core" } +lance-datafusion = { version = "=8.1.0-beta.0", path = "./rust/lance-datafusion" } +lance-datagen = { version = "=8.1.0-beta.0", path = "./rust/lance-datagen" } +lance-derive = { version = "=8.1.0-beta.0", path = "./rust/lance-derive" } +lance-encoding = { version = "=8.1.0-beta.0", path = "./rust/lance-encoding" } +lance-file = { version = "=8.1.0-beta.0", path = "./rust/lance-file" } +lance-geo = { version = "=8.1.0-beta.0", path = "./rust/lance-geo" } +lance-index = { version = "=8.1.0-beta.0", path = "./rust/lance-index" } +lance-io = { version = "=8.1.0-beta.0", path = "./rust/lance-io", default-features = false } +lance-linalg = { version = "=8.1.0-beta.0", 
path = "./rust/lance-linalg" } +lance-namespace = { version = "=8.1.0-beta.0", path = "./rust/lance-namespace" } +lance-namespace-impls = { version = "=8.1.0-beta.0", path = "./rust/lance-namespace-impls" } lance-namespace-datafusion = { version = "=7.0.0-beta.9", path = "./rust/lance-namespace-datafusion" } lance-namespace-reqwest-client = "0.8.6" -lance-select = { version = "=8.0.0-beta.19", path = "./rust/lance-select" } -lance-tokenizer = { version = "=8.0.0-beta.19", path = "./rust/lance-tokenizer" } -lance-table = { version = "=8.0.0-beta.19", path = "./rust/lance-table" } -lance-test-macros = { version = "=8.0.0-beta.19", path = "./rust/lance-test-macros" } -lance-testing = { version = "=8.0.0-beta.19", path = "./rust/lance-testing" } +lance-select = { version = "=8.1.0-beta.0", path = "./rust/lance-select" } +lance-tokenizer = { version = "=8.1.0-beta.0", path = "./rust/lance-tokenizer" } +lance-table = { version = "=8.1.0-beta.0", path = "./rust/lance-table" } +lance-test-macros = { version = "=8.1.0-beta.0", path = "./rust/lance-test-macros" } +lance-testing = { version = "=8.1.0-beta.0", path = "./rust/lance-testing" } approx = "0.5.1" # Note that this one does not include pyarrow arrow = { version = "58.0.0", optional = false, features = ["prettyprint"] } @@ -104,7 +104,7 @@ half = { "version" = "2.1", default-features = false, features = [ "num-traits", "std", ] } -lance-bitpacking = { version = "=8.0.0-beta.19", path = "./rust/compression/bitpacking" } +lance-bitpacking = { version = "=8.1.0-beta.0", path = "./rust/compression/bitpacking" } bitpacking = "0.9" bitvec = "1" bytes = "1.11.1" @@ -143,7 +143,7 @@ datafusion-substrait = { version = "53.0.0", default-features = false } dirs = "6.0.0" either = "1.0" fst = { version = "0.4.7", features = ["levenshtein"] } -fsst = { version = "=8.0.0-beta.19", path = "./rust/compression/fsst" } +fsst = { version = "=8.1.0-beta.0", path = "./rust/compression/fsst" } futures = "0.3" geoarrow-array = "0.8" 
geoarrow-schema = "0.8" diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index 1a5978d5782..ee52544ba57 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -929,9 +929,9 @@ dependencies = [ [[package]] name = "bitvec" -version = "1.0.1" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c" +checksum = "ddcec3d12c579d40898fe0a9a358a803c23e9c52ca3c425707f81c9436211837" dependencies = [ "funty", "radium", @@ -2479,7 +2479,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "rand 0.9.4", @@ -3665,7 +3665,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arc-swap", "arrow", @@ -3738,7 +3738,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -3780,7 +3780,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrayref", "paste", @@ -3789,7 +3789,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -3827,7 +3827,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-array", @@ -3859,7 +3859,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-array", @@ -3876,7 +3876,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ 
"proc-macro2", "quote", @@ -3885,7 +3885,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow-arith", "arrow-array", @@ -3920,7 +3920,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow-arith", "arrow-array", @@ -3950,7 +3950,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "datafusion", "geo-traits", @@ -3964,7 +3964,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arc-swap", "arrow", @@ -4032,7 +4032,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-arith", @@ -4073,7 +4073,7 @@ dependencies = [ [[package]] name = "lance-jni" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-array", @@ -4109,7 +4109,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -4124,7 +4124,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow", "async-trait", @@ -4136,7 +4136,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-ipc", @@ -4185,7 +4185,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -4200,7 +4200,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-array", @@ -4237,7 +4237,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.19" +version = 
"8.1.0-beta.0" dependencies = [ "icu_segmenter", "rust-stemmers", diff --git a/java/lance-jni/Cargo.toml b/java/lance-jni/Cargo.toml index 4df708de9d7..6210c5daf1d 100644 --- a/java/lance-jni/Cargo.toml +++ b/java/lance-jni/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lance-jni" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" edition = "2024" authors = ["Lance Devs "] rust-version = "1.91" diff --git a/java/pom.xml b/java/pom.xml index 33b4d45bf01..6306ecc63f9 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -7,7 +7,7 @@ org.lance lance-core Lance Core - 8.0.0-beta.19 + 8.1.0-beta.0 jar Lance Format Java API diff --git a/python/Cargo.lock b/python/Cargo.lock index 6acf34203ab..01d2edda1c8 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -1059,9 +1059,9 @@ dependencies = [ [[package]] name = "bitvec" -version = "1.0.1" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c" +checksum = "ddcec3d12c579d40898fe0a9a358a803c23e9c52ca3c425707f81c9436211837" dependencies = [ "funty", "radium", @@ -2859,7 +2859,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4067,7 +4067,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arc-swap", "arrow", @@ -4141,7 +4141,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -4183,7 +4183,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrayref", "paste", @@ -4192,7 +4192,7 @@ dependencies = [ [[package]] name = "lance-core" -version = 
"8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -4230,7 +4230,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-array", @@ -4262,7 +4262,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-array", @@ -4279,7 +4279,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "proc-macro2", "quote", @@ -4288,7 +4288,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow-arith", "arrow-array", @@ -4323,7 +4323,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow-arith", "arrow-array", @@ -4353,7 +4353,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "datafusion", "geo-traits", @@ -4367,7 +4367,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arc-swap", "arrow", @@ -4436,7 +4436,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-arith", @@ -4477,7 +4477,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -4492,7 +4492,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow", "async-trait", @@ -4504,7 +4504,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-ipc", @@ -4553,7 +4553,7 @@ dependencies = [ 
[[package]] name = "lance-select" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow-array", "arrow-buffer", @@ -4568,7 +4568,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "arrow", "arrow-array", @@ -4607,7 +4607,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "icu_segmenter", "jieba-rs", @@ -6045,7 +6045,7 @@ dependencies = [ [[package]] name = "pylance" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" dependencies = [ "alloc-stdlib", "arrow", diff --git a/python/Cargo.toml b/python/Cargo.toml index 12b14cd82a2..c90aa34a23b 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pylance" -version = "8.0.0-beta.19" +version = "8.1.0-beta.0" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" From 062ee31c01b00f0f72a12e69f34d961e42924428 Mon Sep 17 00:00:00 2001 From: ZTorChan <52128037+ztorchan@users.noreply.github.com> Date: Fri, 19 Jun 2026 05:44:02 +0800 Subject: [PATCH 149/177] fix: route JSON index queries to the correct sub-parser by path (#7072) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem When a dataset has a JSON column and multiple JSON indices are created on different JSON paths of that same column (e.g. one index on `$.a` and another on `$.b`), query routing is incorrect. A query like `json_extract(json, '$.b') = 'foo'` may hit the `$.a` index instead of the `$.b` index, producing wrong results. ## Root Cause `maybe_indexed_column` obtains a parser from `IndexInformationProvider::get_index()`, which returns a `&dyn ScalarQueryParser` pointing to a `MultiQueryParser` that aggregates all sub-parsers for that column. The flow was: 1. `get_index()` returns `MultiQueryParser` as `&dyn ScalarQueryParser` 2. 
`parser.is_valid_reference(expr, data_type)` is called — `MultiQueryParser`'s impl iterates children and returns `Some(DataType)` from the **first** child that accepts, but discards **which** child matched 3. The same `MultiQueryParser` is then used for `visit_eq` / `visit_between` etc., which also iterate children and return the first non-`None` result — potentially a **different** child than the one that validated the reference This means the query can be dispatched to the wrong JSON index (e.g. the `$.a` index for a `$.b` query). ## Fix - **Change `IndexInformationProvider::get_index`** to return `(&DataType, &MultiQueryParser)` instead of `(&DataType, &dyn ScalarQueryParser)`, so callers can interact with the `MultiQueryParser` directly - **Add `MultiQueryParser::select(expr, data_type)`** — iterates child parsers and returns `(&dyn ScalarQueryParser, DataType)` from the first child whose `is_valid_reference` accepts the expression, preserving **which** child matched - **Update `maybe_indexed_column`** to call `multi.select(expr, data_type)` instead of `parser.is_valid_reference(expr, data_type)`, obtaining the precise sub-parser for all subsequent operations ## Test Added regression test `test_multi_json_indices_route_by_path` that: - Creates a `MultiQueryParser` with two `JsonQueryParser` sub-parsers (for `$.a` and `$.b`) - Verifies `json_extract(json, '$.b') = 'foo'` resolves to the `json_b_idx` index - Verifies `json_extract(json, '$.a') = 'foo'` resolves to the `json_a_idx` index - Verifies `json_extract(json, '$.c') = 'foo'` (unindexed path) does not bind to any index --- rust/lance-index/src/scalar/expression.rs | 118 ++++++++++++++++++---- rust/lance/src/index.rs | 8 +- 2 files changed, 104 insertions(+), 22 deletions(-) diff --git a/rust/lance-index/src/scalar/expression.rs b/rust/lance-index/src/scalar/expression.rs index ea7fbabc813..053da5ae5e7 100644 --- a/rust/lance-index/src/scalar/expression.rs +++ b/rust/lance-index/src/scalar/expression.rs @@ 
-179,6 +179,18 @@ impl MultiQueryParser { pub fn add(&mut self, other: Box) { self.parsers.push(other); } + + /// Pick the first underlying parser whose `is_valid_reference` accepts `expr`. + pub fn select( + &self, + expr: &Expr, + data_type: &DataType, + ) -> Option<(&dyn ScalarQueryParser, DataType)> { + self.parsers.iter().find_map(|p| { + p.is_valid_reference(expr, data_type) + .map(|dt| (p.as_ref(), dt)) + }) + } } impl ScalarQueryParser for MultiQueryParser { @@ -1585,8 +1597,8 @@ fn maybe_indexed_column<'b>( ) -> Option<(String, DataType, &'b dyn ScalarQueryParser)> { // First try to extract the full nested column path for get_field expressions if let Some(nested_path) = extract_nested_column_path(expr) - && let Some((data_type, parser)) = index_info.get_index(&nested_path) - && let Some(data_type) = parser.is_valid_reference(expr, data_type) + && let Some((data_type, multi)) = index_info.get_index(&nested_path) + && let Some((parser, data_type)) = multi.select(expr, data_type) { return Some((nested_path, data_type, parser)); } @@ -1594,12 +1606,9 @@ fn maybe_indexed_column<'b>( match expr { Expr::Column(col) => { let col = col.name.as_str(); - let (data_type, parser) = index_info.get_index(col)?; - if let Some(data_type) = parser.is_valid_reference(expr, data_type) { - Some((col.to_string(), data_type, parser)) - } else { - None - } + let (data_type, multi) = index_info.get_index(col)?; + let (parser, data_type) = multi.select(expr, data_type)?; + Some((col.to_string(), data_type, parser)) } Expr::ScalarFunction(udf) => { if udf.args.is_empty() { @@ -1607,12 +1616,9 @@ fn maybe_indexed_column<'b>( } // For non-get_field functions, fall back to old behavior let col = maybe_column(&udf.args[0])?; - let (data_type, parser) = index_info.get_index(col)?; - if let Some(data_type) = parser.is_valid_reference(expr, data_type) { - Some((col.to_string(), data_type, parser)) - } else { - None - } + let (data_type, multi) = index_info.get_index(col)?; + let (parser, 
data_type) = multi.select(expr, data_type)?; + Some((col.to_string(), data_type, parser)) } _ => None, } @@ -1977,7 +1983,7 @@ fn visit_node( pub trait IndexInformationProvider { /// Check if an index exists for `col` and, if so, return the data type of col /// as well as a query parser that can parse queries for that column - fn get_index(&self, col: &str) -> Option<(&DataType, &dyn ScalarQueryParser)>; + fn get_index(&self, col: &str) -> Option<(&DataType, &MultiQueryParser)>; /// The set of fragments covered by `(column, index_name)`. /// @@ -2159,11 +2165,18 @@ mod tests { struct ColInfo { data_type: DataType, - parser: Box, + parser: Box, } impl ColInfo { fn new(data_type: DataType, parser: Box) -> Self { + Self { + data_type, + parser: Box::new(MultiQueryParser::single(parser)), + } + } + + fn with_multi(data_type: DataType, parser: Box) -> Self { Self { data_type, parser } } } @@ -2185,7 +2198,7 @@ mod tests { } impl IndexInformationProvider for MockIndexInfoProvider { - fn get_index(&self, col: &str) -> Option<(&DataType, &dyn ScalarQueryParser)> { + fn get_index(&self, col: &str) -> Option<(&DataType, &MultiQueryParser)> { self.indexed_columns .get(col) .map(|col_info| (&col_info.data_type, col_info.parser.as_ref())) @@ -3354,4 +3367,75 @@ mod tests { assert_eq!(round_tripped.upper, RowAddrMask::from_allowed(upper_addrs)); assert_eq!(round_tripped_frags, fragments_covered); } + + /// Regression test: when two JSON indices target different paths on the same + /// column, a query against one path must be routed to its own index instead + /// of being intercepted by whichever parser was registered first. + #[test] + fn test_multi_json_indices_route_by_path() { + // Build a MultiQueryParser containing two JSON sub-parsers: one for + // path "$.a" and one for path "$.b". 
+ let mut multi = MultiQueryParser::single(Box::new(JsonQueryParser::new( + "$.a".to_string(), + Box::new(SargableQueryParser::new( + "json_a_idx".to_string(), + "Json".to_string(), + false, + )), + ))); + multi.add(Box::new(JsonQueryParser::new( + "$.b".to_string(), + Box::new(SargableQueryParser::new( + "json_b_idx".to_string(), + "Json".to_string(), + false, + )), + ))); + + let index_info = MockIndexInfoProvider::new(vec![( + "json", + ColInfo::with_multi(DataType::LargeBinary, Box::new(multi)), + )]); + + // Query against path "$.b" must hit the "$.b" index. + let expected_b = IndexedExpression::index_query( + "json".to_string(), + "json_b_idx".to_string(), + "Json".to_string(), + Arc::new(JsonQuery::new( + Arc::new(SargableQuery::Equals(ScalarValue::Utf8(Some( + "foo".to_string(), + )))), + "$.b".to_string(), + )), + ); + check( + &index_info, + "json_extract(json, '$.b') = 'foo'", + Some(expected_b), + false, + ); + + // Query against path "$.a" must hit the "$.a" index. + let expected_a = IndexedExpression::index_query( + "json".to_string(), + "json_a_idx".to_string(), + "Json".to_string(), + Arc::new(JsonQuery::new( + Arc::new(SargableQuery::Equals(ScalarValue::Utf8(Some( + "foo".to_string(), + )))), + "$.a".to_string(), + )), + ); + check( + &index_info, + "json_extract(json, '$.a') = 'foo'", + Some(expected_a), + false, + ); + + // Query against an unindexed path must not bind to either index. 
+ check_no_index(&index_info, "json_extract(json, '$.c') = 'foo'"); + } } diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 6a61e6fb0d6..1a3a3aa54ec 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -29,9 +29,7 @@ use lance_index::mem_wal::{MEM_WAL_INDEX_NAME, MemWalIndex}; use lance_index::optimize::OptimizeOptions; use lance_index::pb::index::Implementation; pub use lance_index::progress::{IndexBuildProgress, NoopIndexBuildProgress}; -use lance_index::scalar::expression::{ - IndexInformationProvider, MultiQueryParser, ScalarQueryParser, -}; +use lance_index::scalar::expression::{IndexInformationProvider, MultiQueryParser}; use lance_index::scalar::inverted::{InvertedIndex, InvertedIndexPlugin}; use lance_index::scalar::lance_format::LanceIndexStore; use lance_index::scalar::registry::{TrainingCriteria, TrainingOrdering}; @@ -660,10 +658,10 @@ pub struct ScalarIndexInfo { } impl IndexInformationProvider for ScalarIndexInfo { - fn get_index(&self, col: &str) -> Option<(&DataType, &dyn ScalarQueryParser)> { + fn get_index(&self, col: &str) -> Option<(&DataType, &MultiQueryParser)> { self.indexed_columns .get(col) - .map(|(ty, parser)| (ty, parser.as_ref() as &dyn ScalarQueryParser)) + .map(|(ty, parser)| (ty, parser.as_ref())) } fn fragment_bitmap(&self, column: &str, index_name: &str) -> Option { From 245f927f91298073e670b7adf1d990a0c90e75f1 Mon Sep 17 00:00:00 2001 From: Haocheng Liu <30446009+HaochengLIU@users.noreply.github.com> Date: Thu, 18 Jun 2026 17:53:13 -0400 Subject: [PATCH 150/177] refactor(index): rely on total ordering for nan zonemap max (#7049) ## Summary - remove the dedicated finite_value_may_be_in_zone helper - rely on ScalarValue total ordering for finite values against NaN zonemap max values - add a focused assertion covering finite targets below a stored NaN max ## Tests - cargo test -p lance-index scalar::zonemap::tests::test_nan_zonemap_index -- --nocapture - cargo test -p lance 
dataset::scanner::test::test_inexact_scalar_index_plans -- --nocapture --- rust/lance-index/src/scalar/zonemap.rs | 46 ++++++++++++++------------ 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/rust/lance-index/src/scalar/zonemap.rs b/rust/lance-index/src/scalar/zonemap.rs index 9f2228740c2..8e7e20c211a 100644 --- a/rust/lance-index/src/scalar/zonemap.rs +++ b/rust/lance-index/src/scalar/zonemap.rs @@ -151,26 +151,11 @@ impl ZoneMapIndex { Self::zone_has_finite_min(zone) && !(zone.max.is_null() || Self::scalar_is_nan(&zone.max)) } - fn finite_value_may_be_in_zone(value: &ScalarValue, zone: &ZoneMapStatistics) -> bool { - if !Self::zone_has_finite_min(zone) || value < &zone.min { - return false; - } - - if Self::scalar_is_nan(&zone.max) { - // A NaN max means this zone had both NaNs and finite values. The - // finite max is not persisted, so keep the zone as a false positive - // instead of using total ordering to prune it. - return true; - } - - !zone.max.is_null() && value <= &zone.max - } - /// Evaluates whether a zone could potentially contain values matching the query. /// - /// NaN query values use the explicit `nan_count`. When the stored max is - /// NaN we do not treat it as a finite upper bound; that representation means - /// the zone had finite values plus NaNs, and the finite max was not persisted. + /// NaN query values use the explicit `nan_count`. For finite query values, + /// `ScalarValue` total ordering keeps finite values below a stored NaN max, + /// so zones with finite values plus NaNs remain conservative false positives. 
fn evaluate_zone_against_query( &self, zone: &ZoneMapStatistics, @@ -206,7 +191,7 @@ impl ZoneMapIndex { return Ok(false); } - Ok(Self::finite_value_may_be_in_zone(target, zone)) + Ok(target >= &zone.min && target <= &zone.max) } SargableQuery::Range(start, end) => { // Zone overlaps with query range if there's any intersection between @@ -336,22 +321,28 @@ impl ZoneMapIndex { ScalarValue::Float16(Some(f)) => { if f.is_nan() { zone.nan_count > 0 + } else if !Self::zone_has_finite_min(zone) { + false } else { - Self::finite_value_may_be_in_zone(value, zone) + value >= &zone.min && value <= &zone.max } } ScalarValue::Float32(Some(f)) => { if f.is_nan() { zone.nan_count > 0 + } else if !Self::zone_has_finite_min(zone) { + false } else { - Self::finite_value_may_be_in_zone(value, zone) + value >= &zone.min && value <= &zone.max } } ScalarValue::Float64(Some(f)) => { if f.is_nan() { zone.nan_count > 0 + } else if !Self::zone_has_finite_min(zone) { + false } else { - Self::finite_value_may_be_in_zone(value, zone) + value >= &zone.min && value <= &zone.max } } _ => { @@ -1438,6 +1429,17 @@ mod tests { ); } + let zone = &index.zones[0]; + assert!(matches!( + zone.max, + ScalarValue::Float32(Some(value)) if value.is_nan() + )); + let finite_target = ScalarValue::Float32(Some(1000.0)); + assert!( + finite_target >= zone.min && finite_target <= zone.max, + "ScalarValue total ordering keeps finite values below NaN max" + ); + // Test search for NaN values using Equals with NaN let query = SargableQuery::Equals(ScalarValue::Float32(Some(f32::NAN))); let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); From fb17c93eadb2f32f91bf37bcf3acc99e8ffc0ca1 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 18 Jun 2026 16:10:33 -0700 Subject: [PATCH 151/177] ci(java): parallelize macOS build, split publish; scope release lock updates (#7341) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Java publish workflow built the 
macOS native lib *and* published the multi-platform JAR in a single job, so the macOS build was gated behind the Linux builds (`needs: [linux-arm64, linux-x86]`) even though only the publish step depends on the Linux artifacts. ## Workflow restructure (`java-publish.yml`) - **`build-linux`** — the two near-identical Debian 10 builds collapsed into a matrix (x86-64, arm64). - **`build-macos`** — build-only, no `needs`, runs concurrently with the Linux builds. - **`publish`** — a separate `ubuntu-latest` job that gathers all three native libs and packages/deploys with `-Dskip.build.jni=true`. No longer holds the expensive macOS runner during publish. Critical path goes from `linux → (macos build + publish)` to `max(linux, macos) → publish`. ## Drop no-op `-P shade-jar` That profile doesn't exist; shading is an always-on plugin bound to the `package` phase. The flag only produced a `could not be activated` warning. ## `cargo update --workspace` in release scripts The release scripts ran bare `cargo update` after the version bump, which refreshed *all* transitive dependencies rather than just the local crates being re-versioned. This swept incidental dependency bumps into release commits — see [this run](https://github.com/lance-format/lance/actions/runs/26624032632/job/78456375994#step:5:2496), where the lock update pulled in `brotli`, `hyper`, `jiff`, `zerocopy`, and ~14 others on top of the intended `lance-*` version changes. `--workspace` re-pins only the workspace crates whose versions changed. Applied to all three release scripts (`create_release_branch.sh`, `release_common.sh`, `publish_beta.sh`) for consistency. > [!NOTE] > The PR's `pull_request` trigger exercises the full build + dry-run path. The `cargo update --workspace` change only takes effect on the next release-tooling run. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-authored-by: Claude Opus 4.8 (1M context) --- .github/workflows/java-publish.yml | 139 ++++++++++------------------- ci/create_release_branch.sh | 12 +-- ci/publish_beta.sh | 12 +-- ci/release_common.sh | 6 +- 4 files changed, 62 insertions(+), 107 deletions(-) diff --git a/.github/workflows/java-publish.yml b/.github/workflows/java-publish.yml index a51cf969a87..2b22b60dc92 100644 --- a/.github/workflows/java-publish.yml +++ b/.github/workflows/java-publish.yml @@ -28,10 +28,24 @@ permissions: contents: read jobs: - linux-arm64: - name: Build on Linux Arm64 - runs-on: ubuntu-24.04-arm64-8x + build-linux: + name: Build on Linux ${{ matrix.arch }} + runs-on: ${{ matrix.runner }} timeout-minutes: 60 + strategy: + fail-fast: false + matrix: + include: + - arch: x86-64 + runner: ubuntu-24.04 + docker_platform: linux/amd64 + protoc_arch: x86_64 + artifact: liblance_jni_linux_x86_64.zip + - arch: arm64 + runner: ubuntu-24.04-arm64-8x + docker_platform: linux/arm64 + protoc_arch: aarch_64 + artifact: liblance_jni_linux_arm_64.zip steps: - name: Checkout repository uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 @@ -41,9 +55,9 @@ jobs: uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3 - name: Check glibc version outside docker run: ldd --version - - name: Build and run in Debian 10 Arm64 container + - name: Build and run in Debian 10 container run: | - docker run --platform linux/arm64 -v ${{ github.workspace }}:/workspace -w /workspace debian:10 bash -c " + docker run --platform ${{ matrix.docker_platform }} -v ${{ github.workspace }}:/workspace -w /workspace debian:10 bash -c " set -ex # Update sources.list to use archive repositories for Debian 10 (EOL) @@ -81,7 +95,7 @@ jobs: unzip # https://github.com/databendlabs/databend/issues/8035 - PROTOC_ZIP=protoc-3.15.0-linux-aarch_64.zip + PROTOC_ZIP=protoc-3.15.0-linux-${{ matrix.protoc_arch }}.zip curl -OL 
https://github.com/protocolbuffers/protobuf/releases/download/v3.15.0/\$PROTOC_ZIP unzip -o \$PROTOC_ZIP -d /usr/local rm -f \$PROTOC_ZIP @@ -102,101 +116,44 @@ jobs: " - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 with: - name: liblance_jni_linux_arm_64.zip + name: ${{ matrix.artifact }} path: java/lance-jni/target/release/liblance_jni.so retention-days: 1 if-no-files-found: error - linux-x86: - name: Build on Linux x86-64 - runs-on: ubuntu-24.04 + build-macos: + name: Build on MacOS Arm64 + runs-on: warp-macos-14-arm64-6x timeout-minutes: 60 steps: - name: Checkout repository uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 with: ref: ${{ inputs.ref || github.ref }} - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3 - - name: Check glibc version outside docker - run: ldd --version - - name: Build and run in Debian 10 X86-64 container - run: | - docker run --platform linux/amd64 -v ${{ github.workspace }}:/workspace -w /workspace debian:10 bash -c " - - set -ex - # Update sources.list to use archive repositories for Debian 10 (EOL) - echo 'deb http://archive.debian.org/debian/ buster main' > /etc/apt/sources.list - echo 'deb http://archive.debian.org/debian-security buster/updates main' >> /etc/apt/sources.list - echo 'deb http://archive.debian.org/debian/ buster-updates main' >> /etc/apt/sources.list - apt-get update - - DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends --assume-yes \ - apt-transport-https \ - ca-certificates \ - curl \ - gpg \ - bash \ - less \ - openssl \ - libssl-dev \ - pkg-config \ - libsqlite3-dev \ - libsqlite3-0 \ - libreadline-dev \ - git \ - cmake \ - dh-autoreconf \ - clang \ - g++ \ - libc++-dev \ - libc++abi-dev \ - libprotobuf-dev \ - libncurses5-dev \ - libncursesw5-dev \ - libudev-dev \ - libhidapi-dev \ - zip \ - unzip - - # https://github.com/databendlabs/databend/issues/8035 - 
PROTOC_ZIP=protoc-3.15.0-linux-x86_64.zip - curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v3.15.0/\$PROTOC_ZIP - unzip -o \$PROTOC_ZIP -d /usr/local - rm -f \$PROTOC_ZIP - protoc --version - - curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain stable - source \$HOME/.cargo/env - cargo --version - - cd java/lance-jni - - # https://github.com/rustls/rustls/issues/1967 - export CC=clang - export CXX=clang++ - ldd --version - - cargo build --release - " + - uses: Swatinem/rust-cache@779680da715d629ac1d338a641029a2f4372abb5 # v2 + - uses: Homebrew/actions/setup-homebrew@50b8c2ab4a835c38897ed2c56c293b07167c0b59 # master 2026-03-07 + - name: Install dependencies + run: brew install protobuf + - name: Build native lib + working-directory: java/lance-jni + run: cargo build --release - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 with: - name: liblance_jni_linux_x86_64.zip - path: java/lance-jni/target/release/liblance_jni.so + name: liblance_jni_darwin_aarch64.zip + path: java/lance-jni/target/release/liblance_jni.dylib retention-days: 1 if-no-files-found: error - macos-arm64: - name: Build on MacOS Arm64 and release - runs-on: warp-macos-14-arm64-6x - timeout-minutes: 60 + publish: + name: Publish Java packages + runs-on: ubuntu-latest + timeout-minutes: 30 needs: - - linux-arm64 - - linux-x86 + - build-linux + - build-macos steps: - name: Checkout repository uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 with: ref: ${{ inputs.ref || github.ref }} - - uses: Swatinem/rust-cache@779680da715d629ac1d338a641029a2f4372abb5 # v2 - name: Set up Java 11 uses: actions/setup-java@c1e323688fd81a25caa38c78aa6df2d33d3e20d9 # v4 with: @@ -208,18 +165,16 @@ jobs: server-password: SONATYPE_TOKEN gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }} gpg-passphrase: ${{ secrets.GPG_PASSPHRASE }} - - uses: Homebrew/actions/setup-homebrew@50b8c2ab4a835c38897ed2c56c293b07167c0b59 # master 2026-03-07 - - name: 
Install dependencies - run: | - brew install protobuf - brew install gpg - - name: Download artifact + - name: Download artifacts uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4 - name: Copy native libs run: | - mkdir -p ./java/target/classes/nativelib/linux-x86-64 ./java/target/classes/nativelib/linux-aarch64 + mkdir -p ./java/target/classes/nativelib/linux-x86-64 \ + ./java/target/classes/nativelib/linux-aarch64 \ + ./java/target/classes/nativelib/darwin-aarch64 cp ./liblance_jni_linux_x86_64.zip/liblance_jni.so ./java/target/classes/nativelib/linux-x86-64/liblance_jni.so cp ./liblance_jni_linux_arm_64.zip/liblance_jni.so ./java/target/classes/nativelib/linux-aarch64/liblance_jni.so + cp ./liblance_jni_darwin_aarch64.zip/liblance_jni.dylib ./java/target/classes/nativelib/darwin-aarch64/liblance_jni.dylib - name: Set github run: | git config --global user.email "Lance Github Runner" @@ -230,7 +185,7 @@ jobs: inputs.mode == 'dry_run' working-directory: java run: | - mvn --batch-mode -DskipTests -Drust.release.build=true package + mvn --batch-mode -DskipTests -Dskip.build.jni=true package - name: Publish with Java 11 if: | github.event_name == 'release' || @@ -240,14 +195,14 @@ jobs: echo "use-agent" >> ~/.gnupg/gpg.conf echo "pinentry-mode loopback" >> ~/.gnupg/gpg.conf export GPG_TTY=$(tty) - mvn --batch-mode -DskipTests -Drust.release.build=true -DpushChanges=false -Dgpg.passphrase=${{ secrets.GPG_PASSPHRASE }} deploy -P deploy-to-ossrh -P shade-jar + mvn --batch-mode -DskipTests -Dskip.build.jni=true -DpushChanges=false -Dgpg.passphrase=${{ secrets.GPG_PASSPHRASE }} deploy -P deploy-to-ossrh env: SONATYPE_USER: ${{ secrets.SONATYPE_USER }} SONATYPE_TOKEN: ${{ secrets.SONATYPE_TOKEN }} report-failure: name: Report Workflow Failure runs-on: ubuntu-latest - needs: [linux-arm64, linux-x86, macos-arm64] + needs: [build-linux, build-macos, publish] if: always() && (github.event_name == 'release' || github.event_name == 
'workflow_dispatch') permissions: contents: read diff --git a/ci/create_release_branch.sh b/ci/create_release_branch.sh index 9c7d9d3e58a..db88f5b6b24 100755 --- a/ci/create_release_branch.sh +++ b/ci/create_release_branch.sh @@ -229,9 +229,9 @@ else bump-my-version bump -vv --new-version "${RC_VERSION}" --no-tag patch # Update Cargo.lock files after version bump - cargo update - (cd python && cargo update) - (cd java/lance-jni && cargo update) + cargo update --workspace + (cd python && cargo update --workspace) + (cd java/lance-jni && cargo update --workspace) # Commit the RC version git add -A @@ -259,9 +259,9 @@ else bump-my-version bump -vv --new-version "${NEXT_VERSION}" --no-tag patch # Update Cargo.lock files after version bump - cargo update - (cd python && cargo update) - (cd java/lance-jni && cargo update) + cargo update --workspace + (cd python && cargo update --workspace) + (cd java/lance-jni && cargo update --workspace) git add -A git commit -m "chore: bump main to ${NEXT_VERSION} diff --git a/ci/publish_beta.sh b/ci/publish_beta.sh index f50798a52e0..06fa5c16a91 100644 --- a/ci/publish_beta.sh +++ b/ci/publish_beta.sh @@ -93,9 +93,9 @@ if [[ "${BRANCH}" == "main" ]] && [[ "${CURRENT_VERSION}" =~ -beta\.[0-9]+$ ]]; bump-my-version bump -vv --new-version "${NEXT_VERSION}" --no-tag patch # Update Cargo.lock files after version bump - cargo update - (cd python && cargo update) - (cd java/lance-jni && cargo update) + cargo update --workspace + (cd python && cargo update --workspace) + (cd java/lance-jni && cargo update --workspace) git add -A git commit -m "chore: bump to ${NEXT_VERSION} based on breaking change detection" @@ -133,9 +133,9 @@ echo "Bumping beta version" bump-my-version bump -vv prerelease_num # Update Cargo.lock files after version bump -cargo update -(cd python && cargo update) -(cd java/lance-jni && cargo update) +cargo update --workspace +(cd python && cargo update --workspace) +(cd java/lance-jni && cargo update --workspace) # Get new 
version NEW_VERSION=$(grep '^version = ' Cargo.toml | head -n1 | cut -d'"' -f2) diff --git a/ci/release_common.sh b/ci/release_common.sh index cd653212aae..573202d1689 100644 --- a/ci/release_common.sh +++ b/ci/release_common.sh @@ -29,9 +29,9 @@ bump_and_commit_version() { bump-my-version bump -vv --new-version "${NEW_VERSION}" --no-tag patch # Update Cargo.lock files after version bump - cargo update - (cd python && cargo update) - (cd java/lance-jni && cargo update) + cargo update --workspace + (cd python && cargo update --workspace) + (cd java/lance-jni && cargo update --workspace) git add -A git commit -m "${COMMIT_MESSAGE}" From 629dc8ebcdfe010297537462abdd380c7f274b55 Mon Sep 17 00:00:00 2001 From: YueZhang <69956021+zhangyue19921010@users.noreply.github.com> Date: Fri, 19 Jun 2026 14:29:28 +0800 Subject: [PATCH 152/177] fix: apply per-segment filters and frag-reuse remap in BTree segment merge (#7320) Closes: https://github.com/lance-format/lance/issues/7230 Co-authored-by: zhangyue19921010 --- rust/lance-index/src/scalar/btree.rs | 49 ++++- rust/lance/src/index/append.rs | 278 +++++++++++++++++++++++++-- rust/lance/src/index/scalar/btree.rs | 39 +--- 3 files changed, 311 insertions(+), 55 deletions(-) diff --git a/rust/lance-index/src/scalar/btree.rs b/rust/lance-index/src/scalar/btree.rs index cd24f251718..85c42e9b048 100644 --- a/rust/lance-index/src/scalar/btree.rs +++ b/rust/lance-index/src/scalar/btree.rs @@ -73,7 +73,7 @@ use lance_datafusion::{ chunker::chunk_concat_stream, exec::{LanceExecutionOptions, OneShotExec, execute_plan}, }; -use lance_select::NullableRowAddrSet; +use lance_select::{NullableRowAddrSet, RowSetOps}; use log::{debug, warn}; use object_store::Error as ObjectStoreError; use rangemap::RangeInclusiveMap; @@ -1832,7 +1832,7 @@ impl BTreeIndex { segments: &[Arc], new_data: SendableRecordBatchStream, dest_store: &dyn IndexStore, - old_data_filter: Option, + old_data_filters: &[Option], ) -> Result { let Some(first) = segments.first() 
else { return Err(Error::invalid_input( @@ -1840,6 +1840,15 @@ impl BTreeIndex { )); }; + if old_data_filters.len() != segments.len() { + return Err(Error::invalid_input(format!( + "BTree merge: expected one old-data filter per source segment \ + (segments={}, filters={})", + segments.len(), + old_data_filters.len() + ))); + } + for segment in segments.iter().skip(1) { if segment.data_type != first.data_type { return Err(Error::index(format!( @@ -1861,14 +1870,20 @@ impl BTreeIndex { } let mut inputs: Vec> = Vec::with_capacity(segments.len() + 1); - for segment in segments { + for (segment, old_data_filter) in segments.iter().zip(old_data_filters) { + if filter_keeps_nothing(old_data_filter) { + continue; + } let stream = segment.data_stream().await?; + let stream = match segment.frag_reuse_index.clone() { + Some(frag_reuse_index) => remap_row_ids(stream, frag_reuse_index), + None => stream, + }; let stream = match old_data_filter.clone() { Some(filter) => filter_row_ids(stream, filter), None => stream, }; - let exec = Arc::new(OneShotExec::new(stream)); - inputs.push(exec); + inputs.push(Arc::new(OneShotExec::new(stream))); } inputs.push(Arc::new(OneShotExec::new(new_data))); @@ -1923,6 +1938,28 @@ fn filter_row_ids( Box::pin(RecordBatchStreamAdapter::new(schema, filtered)) } +/// True if `filter` would keep no rows at all (its keep-set is empty), letting +/// the merge skip reading the segment entirely. +fn filter_keeps_nothing(filter: &Option) -> bool { + match filter { + Some(OldIndexDataFilter::Fragments { to_keep, .. }) => to_keep.is_empty(), + Some(OldIndexDataFilter::RowIds(valid)) => valid.is_empty(), + None => false, + } +} + +fn remap_row_ids( + stream: SendableRecordBatchStream, + frag_reuse_index: Arc, +) -> SendableRecordBatchStream { + let schema = stream.schema(); + let remapped = stream.map(move |batch_result| { + let batch = batch_result?; + Ok(frag_reuse_index.remap_row_ids_record_batch(batch, 1)?) 
+ }); + Box::pin(RecordBatchStreamAdapter::new(schema, remapped)) +} + fn wrap_bound(bound: &Bound) -> Bound { match bound { Bound::Unbounded => Bound::Unbounded, @@ -2283,7 +2320,7 @@ impl ScalarIndex for BTreeIndex { &[Arc::new(self.clone())], new_data, dest_store, - old_data_filter, + &[old_data_filter], ) .await } diff --git a/rust/lance/src/index/append.rs b/rust/lance/src/index/append.rs index c4a83bf5b31..99ff7bebe43 100644 --- a/rust/lance/src/index/append.rs +++ b/rust/lance/src/index/append.rs @@ -94,6 +94,54 @@ pub async fn build_old_data_filter( } } +/// Split the stored fragment coverage of `segments` into fragments still live in +/// `dataset` (`effective`) and fragments that compaction or deletion has already +/// retired (`deleted`). +pub fn split_segment_coverage<'a>( + dataset: &Dataset, + segments: impl IntoIterator, +) -> (RoaringBitmap, RoaringBitmap) { + let mut effective = RoaringBitmap::new(); + let mut deleted = RoaringBitmap::new(); + for segment in segments { + if let Some(eff) = segment.effective_fragment_bitmap(&dataset.fragment_bitmap) { + effective |= eff; + } + if let Some(del) = segment.deleted_fragment_bitmap(&dataset.fragment_bitmap) { + deleted |= del; + } + } + (effective, deleted) +} + +/// Build one [`OldIndexDataFilter`] per segment, each derived from that segment's +/// *own* effective (still-live) and retired fragment coverage, plus the union of +/// every segment's still-live coverage. 
+pub async fn build_per_segment_filters( + dataset: &Dataset, + segments: &[&IndexMetadata], +) -> Result<(RoaringBitmap, Vec>)> { + let mut effective_union = RoaringBitmap::new(); + let mut filters = Vec::with_capacity(segments.len()); + for segment in segments { + if segment.fragment_bitmap.is_none() { + return Err(Error::invalid_input(format!( + "CreateIndex: segment {} is missing fragment coverage", + segment.uuid + ))); + } + let effective = segment + .effective_fragment_bitmap(&dataset.fragment_bitmap) + .unwrap_or_default(); + let deleted = segment + .deleted_fragment_bitmap(&dataset.fragment_bitmap) + .unwrap_or_default(); + effective_union |= &effective; + filters.push(build_old_data_filter(dataset, &effective, &deleted).await?); + } + Ok((effective_union, filters)) +} + async fn load_unindexed_training_data( dataset: &Dataset, field_path: &str, @@ -191,16 +239,8 @@ async fn merge_scalar_indices<'a>( let update_criteria = reference_index.update_criteria(); // Effective = bitmap ∩ live fragments; deleted = bitmap \ live fragments. 
- let mut effective_old_frags = RoaringBitmap::new(); - let mut deleted_old_frags = RoaringBitmap::new(); - for idx in selected_old_indices { - if let Some(effective) = idx.effective_fragment_bitmap(&dataset.fragment_bitmap) { - effective_old_frags |= effective; - } - if let Some(deleted) = idx.deleted_fragment_bitmap(&dataset.fragment_bitmap) { - deleted_old_frags |= deleted; - } - } + let (effective_old_frags, deleted_old_frags) = + split_segment_coverage(dataset.as_ref(), selected_old_indices.iter().copied()); let mut frag_bitmap = base_unindexed_bitmap.clone(); frag_bitmap |= &effective_old_frags; @@ -240,23 +280,28 @@ async fn merge_scalar_indices<'a>( load_unindexed_training_data(dataset.as_ref(), field_path, &update_criteria, unindexed) .await?; let new_store = LanceIndexStore::from_dataset_for_new(&dataset, &new_uuid)?; - let old_data_filter = - build_old_data_filter(dataset.as_ref(), &effective_old_frags, &deleted_old_frags) - .await?; match index_type { IndexType::BTree => { + let (_, old_data_filters) = + build_per_segment_filters(dataset.as_ref(), selected_old_indices).await?; crate::index::scalar::btree::open_and_merge_segments( dataset.as_ref(), field_path, selected_old_indices, new_data_stream, &new_store, - old_data_filter, + &old_data_filters, ) .await? } _ => { + let old_data_filter = build_old_data_filter( + dataset.as_ref(), + &effective_old_frags, + &deleted_old_frags, + ) + .await?; reference_index .update(new_data_stream, &new_store, old_data_filter) .await? 
@@ -740,7 +785,7 @@ mod tests { use arrow::datatypes::{Float32Type, UInt32Type}; use arrow_array::cast::AsArray; use arrow_array::{ - FixedSizeListArray, RecordBatch, RecordBatchIterator, StringArray, UInt32Array, + FixedSizeListArray, Int32Array, RecordBatch, RecordBatchIterator, StringArray, UInt32Array, }; use arrow_schema::{DataType, Field, Schema}; use futures::TryStreamExt; @@ -760,7 +805,7 @@ mod tests { use rstest::rstest; use crate::dataset::builder::DatasetBuilder; - use crate::dataset::optimize::compact_files; + use crate::dataset::optimize::{CompactionOptions, compact_files}; use crate::dataset::{MergeInsertBuilder, WhenMatched, WhenNotMatched, WriteMode, WriteParams}; use crate::index::vector::VectorIndexParams; use crate::utils::test::{DatagenExt, FragmentCount, FragmentRowCount}; @@ -1933,4 +1978,205 @@ mod tests { assert_eq!(after_default[0].uuid, original_uuid); assert_eq!(dataset.manifest.version, original_version); } + + #[rstest] + #[case::address_row_ids(false)] + #[case::stable_row_ids(true)] + #[tokio::test] + async fn test_optimize_btree_no_duplicate_row_addr(#[case] use_stable_row_ids: bool) { + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("payload", DataType::Int32, false), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1])), + Arc::new(Int32Array::from(vec![10])), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + let write_params = WriteParams { + enable_stable_row_ids: use_stable_row_ids, + ..Default::default() + }; + let mut dataset = Dataset::write(reader, test_uri, Some(write_params)) + .await + .unwrap(); + + let params = ScalarIndexParams::for_builtin(BuiltinIndexType::BTree); + dataset + .create_index( + &["id"], + IndexType::BTree, + Some("id_idx".into()), + ¶ms, + true, + ) + .await + .unwrap(); + + 
// Reordered source columns (payload, id) force the partial-schema + // RewriteColumns path instead of a full row rewrite. + let source_schema = Arc::new(Schema::new(vec![ + Field::new("payload", DataType::Int32, false), + Field::new("id", DataType::Int32, false), + ])); + let source_batch = RecordBatch::try_new( + source_schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![100])), + Arc::new(Int32Array::from(vec![1])), + ], + ) + .unwrap(); + let merge_job = + MergeInsertBuilder::try_new(Arc::new(dataset.clone()), vec!["id".to_string()]) + .unwrap() + .when_matched(WhenMatched::UpdateAll) + .try_build() + .unwrap(); + let source_reader = Box::new(RecordBatchIterator::new( + [Ok(source_batch)], + source_schema.clone(), + )); + merge_job + .execute(reader_to_stream(source_reader)) + .await + .unwrap(); + + // Build a delta BTree segment over the now-unindexed fragment. + let mut dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + dataset + .optimize_indices(&OptimizeOptions::append()) + .await + .unwrap(); + assert_eq!( + dataset.load_indices_by_name("id_idx").await.unwrap().len(), + 2, + "append must create a delta segment over the rewritten fragment" + ); + + // Force the old segment + delta segment to merge. 
+ dataset + .optimize_indices(&OptimizeOptions::merge(2)) + .await + .unwrap(); + + let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + let rows = dataset + .scan() + .filter("id = 1") + .unwrap() + .project(&["id"]) + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows(); + assert_eq!(rows, 1, "id = 1 must return exactly one row after merge"); + } + + #[tokio::test] + async fn test_optimize_btree_merge_remaps_deferred_compaction() { + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + let make = |range: std::ops::Range| { + RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from_iter_values(range))], + ) + .unwrap() + }; + + // Two fragments: [0, 50) and [50, 100). + let reader = + RecordBatchIterator::new(vec![Ok(make(0..50)), Ok(make(50..100))], schema.clone()); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + max_rows_per_file: 50, + ..Default::default() + }), + ) + .await + .unwrap(); + assert_eq!(dataset.get_fragments().len(), 2); + + let params = ScalarIndexParams::for_builtin(BuiltinIndexType::BTree); + dataset + .create_index( + &["id"], + IndexType::BTree, + Some("id_idx".into()), + ¶ms, + true, + ) + .await + .unwrap(); + + // Deferred-remap compaction fuses the two fragments into one and leaves a + // pending FragReuseIndex; the index segment is not eagerly remapped. + compact_files( + &mut dataset, + CompactionOptions { + defer_index_remap: true, + ..Default::default() + }, + None, + ) + .await + .unwrap(); + + // Append a third fragment, left unindexed. + let mut dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + dataset + .append( + RecordBatchIterator::new(vec![Ok(make(100..150))], schema.clone()), + None, + ) + .await + .unwrap(); + + // Merge the deferred-remapped old segment with the new delta. 
+ dataset + .optimize_indices(&OptimizeOptions::merge(2)) + .await + .unwrap(); + + let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + // A value from the compacted fragments must still be found via the index. + let hit = dataset + .scan() + .filter("id = 25") + .unwrap() + .project(&["id"]) + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows(); + assert_eq!( + hit, 1, + "compacted-then-merged row must remain queryable via the index" + ); + let total = dataset + .scan() + .filter("id >= 0") + .unwrap() + .project(&["id"]) + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows(); + assert_eq!(total, 150, "no rows may be lost across compaction + merge"); + } } diff --git a/rust/lance/src/index/scalar/btree.rs b/rust/lance/src/index/scalar/btree.rs index eca023d7474..4339b8c183b 100644 --- a/rust/lance/src/index/scalar/btree.rs +++ b/rust/lance/src/index/scalar/btree.rs @@ -17,7 +17,6 @@ use lance_index::scalar::lance_format::LanceIndexStore; use lance_index::scalar::registry::VALUE_COLUMN_NAME; use lance_index::scalar::{CreatedIndex, OldIndexDataFilter}; use lance_table::format::IndexMetadata; -use roaring::RoaringBitmap; use uuid::Uuid; use crate::{Dataset, Error, Result, dataset::index::LanceIndexStoreExt}; @@ -64,7 +63,7 @@ pub(crate) async fn open_and_merge_segments( segments: &[&IndexMetadata], new_data: SendableRecordBatchStream, new_store: &LanceIndexStore, - old_data_filter: Option, + old_data_filters: &[Option], ) -> Result { let mut source_indices = Vec::with_capacity(segments.len()); for &segment in segments { @@ -82,7 +81,7 @@ pub(crate) async fn open_and_merge_segments( })?; source_indices.push(Arc::new(btree.clone())); } - BTreeIndex::merge_segments(&source_indices, new_data, new_store, old_data_filter).await + BTreeIndex::merge_segments(&source_indices, new_data, new_store, old_data_filters).await } /// Merge one caller-defined group of source BTree segments into a single @@ -118,48 +117,22 @@ pub(crate) 
async fn merge_segments( })?; let field_path = dataset.schema().field_path(field_id)?; - // Intersect each segment's stored bitmap with the dataset's current - // fragments so we don't claim coverage on IDs that compaction or pruning - // has already retired. - let dataset_fragments = dataset.fragment_bitmap.as_ref(); - let mut effective_old_frags = RoaringBitmap::new(); - let mut deleted_old_frags = RoaringBitmap::new(); - for segment in &segments { - if segment.fragment_bitmap.is_none() { - return Err(Error::invalid_input(format!( - "CreateIndex: segment {} is missing fragment coverage", - segment.uuid - ))); - } - if let Some(effective) = segment.effective_fragment_bitmap(dataset_fragments) { - effective_old_frags |= effective; - } - if let Some(deleted) = segment.deleted_fragment_bitmap(dataset_fragments) { - deleted_old_frags |= deleted; - } - } - - let fragment_bitmap = effective_old_frags.clone(); - let old_data_filter = crate::index::append::build_old_data_filter( - dataset, - &effective_old_frags, - &deleted_old_frags, - ) - .await?; + let segment_refs: Vec<&IndexMetadata> = segments.iter().collect(); + let (fragment_bitmap, old_data_filters) = + crate::index::append::build_per_segment_filters(dataset, &segment_refs).await?; let output_uuid = Uuid::new_v4(); let new_store = LanceIndexStore::from_dataset_for_new(dataset, &output_uuid)?; // Pure segment consolidation: no dataset scan, so `new_data` is an empty // stream and the merge is driven entirely by the source page data. 
let empty_new_data = empty_btree_update_stream(dataset, field_id)?; - let segment_refs: Vec<&IndexMetadata> = segments.iter().collect(); let created_index = open_and_merge_segments( dataset, &field_path, &segment_refs, empty_new_data, &new_store, - old_data_filter, + &old_data_filters, ) .await?; From 75bc428860fa2d1427423216c3ea1de28bfabd97 Mon Sep 17 00:00:00 2001 From: Armaan Sandhu <74664101+Ar-maan05@users.noreply.github.com> Date: Fri, 19 Jun 2026 13:26:18 +0530 Subject: [PATCH 153/177] fix: evaluate all list-element docs in FTS prefilter walk-the-allowlist branch (#7246) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem FTS `search()` combined with a `where(...)` prefilter on a `list` / `large_list` column silently drops matches when the query token sits at any position **other than the last** in a row's list. `.postfilter()` (FTS first, then filter) returns the correct rows. Reported as lancedb#3352 with a runnable Python repro. The plan is `MatchQuery > ScalarIndexQuery`, and the bug only surfaces when the planner picks the small-allowlist prefilter path (`index_comparisons ≈ allowlist size`): | Target row `keywords` | prefilter (default) | postfilter | |---|---|---| | `["needle", "synonym"]` | **0 rows (bug)** | 2 rows | | `["synonym", "needle"]` | 2 rows | 2 rows | ## Root cause A list column indexes every element as its own document, so one `row_id` owns several `doc_id`s: `DocSet.inv` (a `Vec<(row_id, doc_id)>` sorted by `row_id`) holds multiple entries per row. `DocSet::doc_id(row_id)` resolved a row to a **single** `doc_id` via `binary_search_by_key`, and its only caller is `Wand::flat_search`: the walk-the-allowlist prefilter branch. It therefore evaluated just one of the row's documents against the posting lists; when the query token lived in any other element, the row became a false negative. 
The regular WAND path is forward-driven (document -> `row_id`, with a per-document mask check), so it was always correct; only `flat_search` was affected, which is why the bug is specific to the prefilter branch. ## Fix - Replace `DocSet::doc_id` with `DocSet::doc_ids(row_id) -> impl Iterator<Item = u64>`, which yields every `doc_id` in the contiguous equal-key run in `inv` (the legacy `row_id == doc_id` shape still resolves to a single document). - `flat_search` now expands each allow-listed `row_id` to **all** of its documents (`flat_map` over `doc_ids`) before sorting into doc-id order. This brings `flat_search` to parity with the WAND path, so it introduces no new duplicate-row behaviour: only documents actually present in the posting lists score. ## Tests - `test_doc_ids_resolves_every_document_a_row_owns`: unit coverage of the multi-valued resolution (list shape, legacy shape, and a missing row). - `test_flat_search_finds_list_row_with_match_at_non_last_position` (rstest, compressed + plain): reproduces the bug; it fails on the previous single-`doc_id` resolution and passes with the fix. All 143 `scalar::inverted` tests pass; `cargo fmt --all --check` and `cargo clippy -p lance-index --tests -- -D warnings` are clean. 
Closes lancedb#3352 --- rust/lance-index/src/scalar/inverted/index.rs | 29 ++++--- rust/lance-index/src/scalar/inverted/wand.rs | 79 ++++++++++++++++++- 2 files changed, 94 insertions(+), 14 deletions(-) diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index 1082e1dc371..41a18c3bd68 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -37,7 +37,7 @@ use datafusion::physical_plan::metrics::Time; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use fst::{Automaton, IntoStreamer, Streamer}; use futures::{FutureExt, Stream, StreamExt, TryStreamExt, stream}; -use itertools::Itertools; +use itertools::{Either, Itertools}; use lance_arrow::{RecordBatchExt, iter_str_array}; use lance_core::cache::{CacheCodec, CacheKey, LanceCache, WeakLanceCache}; use lance_core::deepsize::DeepSizeOf; @@ -4615,18 +4615,25 @@ impl DocSet { self.row_ids[doc_id as usize] } - pub fn doc_id(&self, row_id: u64) -> Option { + /// Resolve a `row_id` to every `doc_id` it owns. + /// + /// A scalar column maps each row to a single document, but a + /// `list` column indexes every element as its own document, so a + /// single `row_id` can own several `doc_id`s sharing that key in `inv`. + /// The prefilter path (`flat_search`) walks an allow-list of row_ids and + /// must evaluate *all* of a row's documents; resolving to one `doc_id` + /// silently drops matches at non-last list positions (lancedb#3352). 
+ pub fn doc_ids(&self, row_id: u64) -> impl Iterator + '_ { if self.inv.is_empty() { - // in legacy format, the row id is doc id - match self.row_ids.binary_search(&row_id) { - Ok(_) => Some(row_id), - Err(_) => None, - } + // in legacy format, the row id is doc id (one document per row) + let found = self.row_ids.binary_search(&row_id).is_ok(); + Either::Left(found.then_some(row_id).into_iter()) } else { - match self.inv.binary_search_by_key(&row_id, |x| x.0) { - Ok(idx) => Some(self.inv[idx].1 as u64), - Err(_) => None, - } + // `inv` is sorted by row_id, so the entries sharing this key form a + // contiguous run; yield the doc_id of each. + let lo = self.inv.partition_point(|entry| entry.0 < row_id); + let hi = self.inv.partition_point(|entry| entry.0 <= row_id); + Either::Right(self.inv[lo..hi].iter().map(|entry| entry.1 as u64)) } } pub fn total_tokens_num(&self) -> u64 { diff --git a/rust/lance-index/src/scalar/inverted/wand.rs b/rust/lance-index/src/scalar/inverted/wand.rs index 212038ba432..dc6d2a860fb 100644 --- a/rust/lance-index/src/scalar/inverted/wand.rs +++ b/rust/lance-index/src/scalar/inverted/wand.rs @@ -829,11 +829,16 @@ impl<'a, S: Scorer> Wand<'a, S> { } // we need to map the row ids to doc ids, and sort them, - // because WAND PostingIterator can't go back to the previous doc id + // because WAND PostingIterator can't go back to the previous doc id. + // A list column maps one row id to several doc ids, so expand every + // document the row owns — keying on a single doc id would drop matches + // at non-last list positions (lancedb#3352). 
let doc_ids = row_ids - .filter_map(|row_addr| { + .flat_map(|row_addr| { let row_id: u64 = row_addr.into(); - self.docs.doc_id(row_id).map(|doc_id| (doc_id, row_id)) + self.docs + .doc_ids(row_id) + .map(move |doc_id| (doc_id, row_id)) }) .sorted_unstable() .collect::>(); @@ -2213,6 +2218,74 @@ mod tests { assert_eq!(matched, vec![2]); } + #[test] + fn test_doc_ids_resolves_every_document_a_row_owns() { + // A list column indexes each element as its own document, so + // one row id owns several doc ids. row 100 -> {0, 1}, row 101 -> {2}. + let row_id_col = arrow_array::UInt64Array::from(vec![100_u64, 100, 101]); + let num_tokens_col = arrow_array::UInt32Array::from(vec![1_u32, 1, 1]); + let docs = DocSet::from_columns(&row_id_col, &num_tokens_col, false, None).unwrap(); + + assert_eq!(docs.doc_ids(100).collect::>(), vec![0, 1]); + assert_eq!(docs.doc_ids(101).collect::>(), vec![2]); + assert!(docs.doc_ids(999).next().is_none()); + + // legacy shape (row id == doc id) still resolves to a single document. + let mut legacy = DocSet::default(); + legacy.append(7, 1); + assert_eq!(legacy.doc_ids(7).collect::>(), vec![7]); + assert!(legacy.doc_ids(8).next().is_none()); + } + + #[rstest] + fn test_flat_search_finds_list_row_with_match_at_non_last_position( + #[values(false, true)] is_compressed: bool, + ) { + // row 100 owns two element-documents (doc 0, doc 1) that share its row + // id; row 101 owns doc 2. The query term lives only in doc 0 — the + // *non-last* element of row 100. Resolving the row to a single doc id + // would evaluate doc 1, miss the term, and drop the row (lancedb#3352). 
+ let row_id_col = arrow_array::UInt64Array::from(vec![100_u64, 100, 101]); + let num_tokens_col = arrow_array::UInt32Array::from(vec![1_u32, 1, 1]); + let docs = DocSet::from_columns(&row_id_col, &num_tokens_col, false, None).unwrap(); + + let posting = PostingIterator::with_query_weight( + String::from("needle"), + 0, + 0, + 1.0, + generate_posting_list(vec![0], 1.0, None, is_compressed), + docs.len(), + ); + + let mut wand = Wand::new( + Operator::Or, + vec![posting].into_iter(), + &docs, + InverseDocLengthScorer, + ); + wand.threshold = 0.5; + + let selected = vec![RowAddress::from(100_u64)]; + let result = wand + .flat_search( + &FtsSearchParams::default(), + Box::new(selected.into_iter()), + &NoOpMetricsCollector, + ) + .unwrap(); + + // flat_search resolves the prefilter against the DocSet, so the single + // match comes back as a concrete RowId(100) rather than a deferred + // Pending addr. Asserting on the whole result avoids a never-taken + // match arm that would otherwise read as uncovered. + let addrs = result.into_iter().map(|doc| doc.addr).collect::>(); + assert!( + matches!(addrs.as_slice(), [CandidateAddr::RowId(100)]), + "expected exactly row 100, got {addrs:?}" + ); + } + #[test] fn test_block_max_score_matches_stored_value() { let doc_ids = vec![0_u32]; From b15865f2717d3555b83c7546ca32cad61dce7d70 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 19 Jun 2026 08:03:25 -0700 Subject: [PATCH 154/177] chore: drop Python 3.9 support, set 3.10 as minimum (#7345) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Python 3.9 is reaching end-of-life. This removes it from CI test jobs and released binaries and makes Python 3.10 the minimum supported version. - Bump `requires-python` to `>=3.10` and drop the `Python :: 3.9` classifier in `python/pyproject.toml` and `memtest/pyproject.toml`. - Raise the PyO3 abi3 floor from `abi3-py39` to `abi3-py310`. 
- Update the `python.yml` test matrix (3.10 + 3.13) and the `pypi-publish.yml` release wheel matrices to build for 3.10. - Drop the now-redundant `python_version >= '3.10'` dependency markers and regenerate `uv.lock`. - Update `CONTRIBUTING.md` and the build-action input docs. `docker-compose.yml`'s `version: "3.9"` is the Compose file-format version (not Python) and is left unchanged. No `sys.version_info` checks for 3.9 exist in the source. Closes #7344 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-authored-by: Claude Opus 4.8 (1M context) --- .../workflows/build_linux_wheel/action.yml | 2 +- .github/workflows/build_mac_wheel/action.yml | 2 +- .../workflows/build_windows_wheel/action.yml | 2 +- .github/workflows/pypi-publish.yml | 6 +- .github/workflows/python.yml | 2 +- CONTRIBUTING.md | 2 +- memtest/pyproject.toml | 3 +- python/Cargo.toml | 2 +- python/pyproject.toml | 11 +- python/uv.lock | 767 +++--------------- 10 files changed, 120 insertions(+), 679 deletions(-) diff --git a/.github/workflows/build_linux_wheel/action.yml b/.github/workflows/build_linux_wheel/action.yml index 9016ae67b1a..d6e6e0f1ada 100644 --- a/.github/workflows/build_linux_wheel/action.yml +++ b/.github/workflows/build_linux_wheel/action.yml @@ -3,7 +3,7 @@ name: build-linux-wheel description: "Build a manylinux wheel for lance" inputs: python-minor-version: - description: "9, 10, 11, 12" + description: "10, 11, 12, 13" required: true args: description: "--release" diff --git a/.github/workflows/build_mac_wheel/action.yml b/.github/workflows/build_mac_wheel/action.yml index 9d45bde42aa..0cac76c49cf 100644 --- a/.github/workflows/build_mac_wheel/action.yml +++ b/.github/workflows/build_mac_wheel/action.yml @@ -3,7 +3,7 @@ name: build_wheel description: "Build a lance wheel" inputs: python-minor-version: - description: "9, 10, 11, 12" + description: "10, 11, 12, 13" required: true args: description: "--release" diff --git 
a/.github/workflows/build_windows_wheel/action.yml b/.github/workflows/build_windows_wheel/action.yml index 03b601db019..94475059c75 100644 --- a/.github/workflows/build_windows_wheel/action.yml +++ b/.github/workflows/build_windows_wheel/action.yml @@ -3,7 +3,7 @@ name: build_wheel description: "Build a lance wheel" inputs: python-minor-version: - description: "9, 10, 11, 12" + description: "10, 11, 12, 13" required: true args: description: "--release" diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml index b2bfe284fb5..77c76d6fc69 100644 --- a/.github/workflows/pypi-publish.yml +++ b/.github/workflows/pypi-publish.yml @@ -35,7 +35,7 @@ jobs: name: Python Linux 3.${{ matrix.python-minor-version }} ${{ matrix.config.platform }} manylinux${{ matrix.config.manylinux }} strategy: matrix: - python-minor-version: ["9"] + python-minor-version: ["10"] config: - platform: x86_64 manylinux: "2_17" @@ -101,7 +101,7 @@ jobs: runs-on: ${{ matrix.config.runner }} strategy: matrix: - python-minor-version: ["9"] + python-minor-version: ["10"] config: - target: aarch64-apple-darwin runner: warp-macos-14-arm64-6x @@ -152,7 +152,7 @@ jobs: runs-on: windows-latest-4x strategy: matrix: - python-minor-version: ["9"] + python-minor-version: ["10"] steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 with: diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index f9bb3132b38..cce465807e3 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -97,7 +97,7 @@ jobs: timeout-minutes: 45 strategy: matrix: - python-minor-version: ["9", "13"] + python-minor-version: ["10", "13"] name: "Python Linux 3.${{ matrix.python-minor-version }} x86_64" runs-on: "ubuntu-24.04-4x" defaults: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index cf332215e49..8f3ec285f31 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -19,7 +19,7 @@ If you have any questions, please join our 
[Discord](https://discord.gg/zMM32dvN Currently Lance is implemented in Rust and comes with a Python wrapper. So you'll want to make sure you setup both. 1. Install Rust: https://www.rust-lang.org/tools/install -2. Install Python 3.9+: https://www.python.org/downloads/ +2. Install Python 3.10+: https://www.python.org/downloads/ 3. Install protoctol buffers: https://grpc.io/docs/protoc-installation/ (make sure you have version 3.20 or higher) 4. Install commit hooks: a. Install pre-commit: https://pre-commit.com/#install diff --git a/memtest/pyproject.toml b/memtest/pyproject.toml index 396d7c442e0..4418d0e19c8 100644 --- a/memtest/pyproject.toml +++ b/memtest/pyproject.toml @@ -7,7 +7,7 @@ name = "lance-memtest" version = "0.1.0" description = "Memory allocation testing utilities for Python test suites" readme = "README.md" -requires-python = ">=3.9" +requires-python = ">=3.10" license = { text = "Apache-2.0" } authors = [ { name = "Lance Developers" } @@ -17,7 +17,6 @@ classifiers = [ "Intended Audience :: Developers", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", diff --git a/python/Cargo.toml b/python/Cargo.toml index c90aa34a23b..240c046e5ff 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -60,7 +60,7 @@ prost = "0.14.1" prost-types = "0.14.1" pyo3 = { version = "0.28", features = [ "extension-module", - "abi3-py39", + "abi3-py310", "py-clone", "chrono", ] } diff --git a/python/pyproject.toml b/python/pyproject.toml index d2efab23579..d863fe38517 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -7,7 +7,7 @@ authors = [{ name = "Lance Devs", email = "dev@lance.org" }] license = { file = "LICENSE" } repository = "https://github.com/lancedb/lance" readme = "README.md" -requires-python = ">=3.9" +requires-python = ">=3.10" 
keywords = [ "data-format", "data-science", @@ -30,7 +30,6 @@ classifiers = [ "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", @@ -61,7 +60,7 @@ tests = [ # Only test tensorflow on linux for now. We will deprecate tensorflow soon. "tensorflow; sys_platform == 'linux'", "tqdm", - "datafusion>=53,<54; python_version >= '3.10'", + "datafusion>=53,<54", ] dev = ["ruff==0.11.2", "pyright"] benchmarks = ["pytest-benchmark"] @@ -74,7 +73,7 @@ geo = [ [dependency-groups] tests = [ "boto3==1.40.43", - "datasets==4.1.1; python_version >= '3.10'", + "datasets==4.1.1", "duckdb==1.4.0", "ml_dtypes==0.5.3", "pillow==11.3.0", @@ -82,9 +81,9 @@ tests = [ "polars[pyarrow,pandas]==1.34.0", "psutil==7.1.0", "pytest==8.4.2", - "tensorflow==2.20.0; sys_platform == 'linux' and python_version >= '3.10'", + "tensorflow==2.20.0; sys_platform == 'linux'", "tqdm==4.67.1", - "datafusion==53.0.0; python_version >= '3.10'", + "datafusion==53.0.0", ] dev = [ "maturin==1.13.3", diff --git a/python/uv.lock b/python/uv.lock index 57a73fc9a62..5f1fa45d755 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -1,13 +1,12 @@ version = 1 revision = 3 -requires-python = ">=3.9" +requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.14'", "python_full_version == '3.13.*'", "python_full_version == '3.12.*'", "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", - "python_full_version < '3.10'", + "python_full_version < '3.11'", ] [[package]] @@ -33,15 +32,15 @@ name = "aiohttp" version = "3.14.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "aiohappyeyeballs", marker = "python_full_version >= '3.10'" }, - { name = "aiosignal", marker = "python_full_version >= '3.10'" }, - { name = "async-timeout", 
marker = "python_full_version == '3.10.*'" }, - { name = "attrs", marker = "python_full_version >= '3.10'" }, - { name = "frozenlist", marker = "python_full_version >= '3.10'" }, - { name = "multidict", marker = "python_full_version >= '3.10'" }, - { name = "propcache", marker = "python_full_version >= '3.10'" }, - { name = "typing-extensions", marker = "python_full_version >= '3.10' and python_full_version < '3.13'" }, - { name = "yarl", marker = "python_full_version >= '3.10'" }, + { name = "aiohappyeyeballs" }, + { name = "aiosignal" }, + { name = "async-timeout", marker = "python_full_version < '3.11'" }, + { name = "attrs" }, + { name = "frozenlist" }, + { name = "multidict" }, + { name = "propcache" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, + { name = "yarl" }, ] sdist = { url = "https://files.pythonhosted.org/packages/ee/ab/93ce242f899b68c51b0578c027aafa791ab3614cb9345fa5d37b5f5c8e3e/aiohttp-3.14.0.tar.gz", hash = "sha256:2882de819734c715fd1b9c11c97e09fa020d14438203d1d354d8ed1702791c9b", size = 7940674, upload-time = "2026-06-01T19:41:02.763Z" } wheels = [ @@ -170,8 +169,8 @@ name = "aiosignal" version = "1.4.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "frozenlist", marker = "python_full_version >= '3.10'" }, - { name = "typing-extensions", marker = "python_full_version >= '3.10' and python_full_version < '3.13'" }, + { name = "frozenlist" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/61/62/06741b579156360248d1ec624842ad0edf697050bbaf7c3e46394e106ad1/aiosignal-1.4.0.tar.gz", hash = "sha256:f47eecd9468083c2029cc99945502cb7708b082c232f9aca65da147157b251c7", size = 25007, upload-time = "2025-07-03T22:54:43.528Z" } wheels = [ @@ -235,19 +234,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/28/8a/79c76ad88b16f2fac25684f7313593738f353355eb1af2307e43efd7b1ca/arro3_core-0.6.5-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:de74a2512e2e2366d4b064c498c38672bf6ddea38acec8b1999b4e66182dd001", size = 3104663, upload-time = "2025-10-13T23:11:00.582Z" }, { url = "https://files.pythonhosted.org/packages/20/66/9152feaa87f851a37c1a2bd74fb89d7e82e4c76447ee590bf8e6fff5e9d8/arro3_core-0.6.5-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:806ca8e20507675b2de68b3d009f76e898cc3c3e441c834ea5220866f68aac50", size = 2956440, upload-time = "2025-10-13T23:11:03.769Z" }, { url = "https://files.pythonhosted.org/packages/ad/66/f4179ef64d5c18fe76ec93cfbff42c0f401438ef771c6766b880044d7e13/arro3_core-0.6.5-cp313-cp313t-win_amd64.whl", hash = "sha256:8f6f0cc78877ade7ad6e678a4671b191406547e7b407bc9637436869c017ed47", size = 2845345, upload-time = "2025-10-13T23:11:07.447Z" }, - { url = "https://files.pythonhosted.org/packages/07/c2/407d6bc19813fb74cc2b087ad3e959e102b29ff81e35dcc0ad0dfb5b946c/arro3_core-0.6.5-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:dfac7fac3c6a302399d94644d48682a19488a5b67bd1ccbdf6c560a7ffabde6d", size = 2680237, upload-time = "2025-10-13T23:11:10.876Z" }, - { url = "https://files.pythonhosted.org/packages/d3/73/c67156794d7e9734f4cc03d2eca7e44a1cc014686e6b7663f5110f58581d/arro3_core-0.6.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9fc70042e558d1cd5fbe917b58e8ef52701441e38ff30b1912858050f796a62c", size = 2386228, upload-time = "2025-10-13T23:11:14.02Z" }, - { url = "https://files.pythonhosted.org/packages/79/e8/817ee1abb0cfa7e266ef00749b144553d2bb9c4679ca932ecbca9dc7dea9/arro3_core-0.6.5-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1583b29b2ba83927a33e5435e5d9d134114c45a6360a8bb4db4beda13dab4fd8", size = 2886476, upload-time = "2025-10-13T23:11:17.579Z" }, - { url = 
"https://files.pythonhosted.org/packages/8e/d6/1b9beceab797c4510abfc25ef6e657e4c940d06a9615927ce506463691dd/arro3_core-0.6.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6a170fe53f18dda4a4647fd3b8b4a9373fc11ac42c41a4b65f55d79ad531a33e", size = 2911941, upload-time = "2025-10-13T23:11:21.131Z" }, - { url = "https://files.pythonhosted.org/packages/dc/ed/4fe1fb9a24698fe6189111836d22c9582cbc92fa159b24b8664e924738dc/arro3_core-0.6.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:83047b4e6e18835c91c8d12c5494e6ababc7c185c5a772d3429e8f9b0c185894", size = 3150419, upload-time = "2025-10-13T23:11:24.503Z" }, - { url = "https://files.pythonhosted.org/packages/a1/91/d6215b782fa91493f504ae13623db889beeaf0519037c28fc6744464439a/arro3_core-0.6.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d3d4393d281d1ef18927915a11187da27287d279f99d5325bc9afb417f76084f", size = 2777891, upload-time = "2025-10-13T23:11:28.11Z" }, - { url = "https://files.pythonhosted.org/packages/d4/de/0aa3504e6cbf406086de49b59cb0dcb3ab11f64acbb38602143e479831dc/arro3_core-0.6.5-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:f0c88d8babcf51affdd69390882e2f0ecb1890a1b8a5abfc087d003e7181eb6e", size = 2519673, upload-time = "2025-10-13T23:11:31.426Z" }, - { url = "https://files.pythonhosted.org/packages/05/69/47bf9c9ab66bafc7056a41f6db9d2149639eea6417299e3fe6c01ef99b6c/arro3_core-0.6.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:36424e1d62212466a5cacdc27d414e99bf0fdab1544cc2b7e5b81e41437e5970", size = 3026254, upload-time = "2025-10-13T23:11:36.199Z" }, - { url = "https://files.pythonhosted.org/packages/b1/e8/638582437ab41ba52d3c7f2a1b0a98e4a05a51e3f660985e594b4f6c18d5/arro3_core-0.6.5-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:4eb4d96f7db618f100758a8b7ec1b221c8737d543073701b7ffee74bc5019d46", size = 2704582, upload-time = "2025-10-13T23:11:39.408Z" }, - { url = 
"https://files.pythonhosted.org/packages/aa/0a/7bc46ee799459cce72a2e15b0eb184170f26cac37eace0b813e855fbc4d8/arro3_core-0.6.5-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:2cfe9b4b1dd663d256754f1aa7aae783a1cddd3eb5698892b9caf381431f0af7", size = 3155815, upload-time = "2025-10-13T23:11:43.304Z" }, - { url = "https://files.pythonhosted.org/packages/99/8a/f20eff8f4ff5bd7db9b37b70ea058b37375a930a10e03d584a7597b6b740/arro3_core-0.6.5-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:a3b2621505f97eb5ce80f1c6fa8c77d18d757ab48d1f11d33a805e9ccbcd6fb6", size = 3107791, upload-time = "2025-10-13T23:11:46.735Z" }, - { url = "https://files.pythonhosted.org/packages/79/da/60c66f0cc4a6af7f54e57973190540f77b84da1218fad2a9917e17bd897b/arro3_core-0.6.5-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:6c1becbb96ceba0b20f3d4318dd35f3417ee9a49065813d99f52b0fa285fc569", size = 2957730, upload-time = "2025-10-13T23:11:49.875Z" }, - { url = "https://files.pythonhosted.org/packages/dd/8d/6e3235894196e1fd2be34e01ac2d4280dd24e6c9019e3b12603858651e91/arro3_core-0.6.5-cp39-cp39-win_amd64.whl", hash = "sha256:5459e7bd39bb9dd8c57aa06856d2bebc5c1ca782cbccab0e186c6c89530e4ca9", size = 2839298, upload-time = "2025-10-13T23:11:53.566Z" }, { url = "https://files.pythonhosted.org/packages/10/ca/b2139dbb25f9fefb9b1cdce8a73785615de6763af6a16bf6ff96a3b630f2/arro3_core-0.6.5-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:26d5b50139f1a96727fa1760b4d70393acf5ee0fba45346ad2d4f69824d3bdc2", size = 2676788, upload-time = "2025-10-13T23:11:56.965Z" }, { url = "https://files.pythonhosted.org/packages/34/a1/c68dde2944f493c8ccfcb91bf6da6d27a27c3674316dd09c9560f9e6ab1a/arro3_core-0.6.5-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:b65b3d8d7f65f2f3c36002dc467380d7a31ea771132986dddc6341c5a9dc726f", size = 2382809, upload-time = "2025-10-13T23:12:00.175Z" }, { url = 
"https://files.pythonhosted.org/packages/c6/fc/2fb81d42a3cecd632deace97dc23ac74083d60d158106440c783bae4ff01/arro3_core-0.6.5-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6c3442a79a757ed3fbd7793de180019ae3201f04237537c2e2e3f1e3dd99b31c", size = 2882818, upload-time = "2025-10-13T23:12:03.721Z" }, @@ -314,8 +300,7 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "jmespath" }, { name = "python-dateutil" }, - { name = "urllib3", version = "1.26.20", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "urllib3", version = "2.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "urllib3" }, ] sdist = { url = "https://files.pythonhosted.org/packages/49/d0/3888673417202262ddd7e6361cab8e01ee2705e39643af8445e2eb276eab/botocore-1.40.43.tar.gz", hash = "sha256:d87412dc1ea785df156f412627d3417c9f9eb45601fd0846d8fe96fe3c78b630", size = 14389164, upload-time = "2025-10-01T19:38:16.06Z" } wheels = [ @@ -392,17 +377,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/72/2a/aff5dd112b2f14bcc3462c312dce5445806bfc8ab3a7328555da95330e4b/charset_normalizer-3.4.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d716a916938e03231e86e43782ca7878fb602a125a91e7acb8b5112e2e96ac16", size = 152224, upload-time = "2025-08-09T07:56:51.369Z" }, { url = "https://files.pythonhosted.org/packages/b7/8c/9839225320046ed279c6e839d51f028342eb77c91c89b8ef2549f951f3ec/charset_normalizer-3.4.3-cp314-cp314-win32.whl", hash = "sha256:c6dbd0ccdda3a2ba7c2ecd9d77b37f3b5831687d8dc1b6ca5f56a4880cc7b7ce", size = 100086, upload-time = "2025-08-09T07:56:52.722Z" }, { url = "https://files.pythonhosted.org/packages/ee/7a/36fbcf646e41f710ce0a563c1c9a343c6edf9be80786edeb15b6f62e17db/charset_normalizer-3.4.3-cp314-cp314-win_amd64.whl", hash = "sha256:73dc19b562516fc9bcf6e5d6e596df0b4eb98d87e4f79f3ae71840e6ed21361c", size = 
107400, upload-time = "2025-08-09T07:56:55.172Z" }, - { url = "https://files.pythonhosted.org/packages/c2/ca/9a0983dd5c8e9733565cf3db4df2b0a2e9a82659fd8aa2a868ac6e4a991f/charset_normalizer-3.4.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:70bfc5f2c318afece2f5838ea5e4c3febada0be750fcf4775641052bbba14d05", size = 207520, upload-time = "2025-08-09T07:57:11.026Z" }, - { url = "https://files.pythonhosted.org/packages/39/c6/99271dc37243a4f925b09090493fb96c9333d7992c6187f5cfe5312008d2/charset_normalizer-3.4.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:23b6b24d74478dc833444cbd927c338349d6ae852ba53a0d02a2de1fce45b96e", size = 147307, upload-time = "2025-08-09T07:57:12.4Z" }, - { url = "https://files.pythonhosted.org/packages/e4/69/132eab043356bba06eb333cc2cc60c6340857d0a2e4ca6dc2b51312886b3/charset_normalizer-3.4.3-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:34a7f768e3f985abdb42841e20e17b330ad3aaf4bb7e7aeeb73db2e70f077b99", size = 160448, upload-time = "2025-08-09T07:57:13.712Z" }, - { url = "https://files.pythonhosted.org/packages/04/9a/914d294daa4809c57667b77470533e65def9c0be1ef8b4c1183a99170e9d/charset_normalizer-3.4.3-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:fb731e5deb0c7ef82d698b0f4c5bb724633ee2a489401594c5c88b02e6cb15f7", size = 157758, upload-time = "2025-08-09T07:57:14.979Z" }, - { url = "https://files.pythonhosted.org/packages/b0/a8/6f5bcf1bcf63cb45625f7c5cadca026121ff8a6c8a3256d8d8cd59302663/charset_normalizer-3.4.3-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:257f26fed7d7ff59921b78244f3cd93ed2af1800ff048c33f624c87475819dd7", size = 152487, upload-time = "2025-08-09T07:57:16.332Z" }, - { url = 
"https://files.pythonhosted.org/packages/c4/72/d3d0e9592f4e504f9dea08b8db270821c909558c353dc3b457ed2509f2fb/charset_normalizer-3.4.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1ef99f0456d3d46a50945c98de1774da86f8e992ab5c77865ea8b8195341fc19", size = 150054, upload-time = "2025-08-09T07:57:17.576Z" }, - { url = "https://files.pythonhosted.org/packages/20/30/5f64fe3981677fe63fa987b80e6c01042eb5ff653ff7cec1b7bd9268e54e/charset_normalizer-3.4.3-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:2c322db9c8c89009a990ef07c3bcc9f011a3269bc06782f916cd3d9eed7c9312", size = 161703, upload-time = "2025-08-09T07:57:20.012Z" }, - { url = "https://files.pythonhosted.org/packages/e1/ef/dd08b2cac9284fd59e70f7d97382c33a3d0a926e45b15fc21b3308324ffd/charset_normalizer-3.4.3-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:511729f456829ef86ac41ca78c63a5cb55240ed23b4b737faca0eb1abb1c41bc", size = 159096, upload-time = "2025-08-09T07:57:21.329Z" }, - { url = "https://files.pythonhosted.org/packages/45/8c/dcef87cfc2b3f002a6478f38906f9040302c68aebe21468090e39cde1445/charset_normalizer-3.4.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:88ab34806dea0671532d3f82d82b85e8fc23d7b2dd12fa837978dad9bb392a34", size = 153852, upload-time = "2025-08-09T07:57:22.608Z" }, - { url = "https://files.pythonhosted.org/packages/63/86/9cbd533bd37883d467fcd1bd491b3547a3532d0fbb46de2b99feeebf185e/charset_normalizer-3.4.3-cp39-cp39-win32.whl", hash = "sha256:16a8770207946ac75703458e2c743631c79c59c5890c80011d536248f8eaa432", size = 99840, upload-time = "2025-08-09T07:57:23.883Z" }, - { url = "https://files.pythonhosted.org/packages/ce/d6/7e805c8e5c46ff9729c49950acc4ee0aeb55efb8b3a56687658ad10c3216/charset_normalizer-3.4.3-cp39-cp39-win_amd64.whl", hash = "sha256:d22dbedd33326a4a5190dd4fe9e9e693ef12160c77382d9e87919bce54f3d4ca", size = 107438, upload-time = "2025-08-09T07:57:25.287Z" }, { url = 
"https://files.pythonhosted.org/packages/8a/1f/f041989e93b001bc4e44bb1669ccdcf54d3f00e628229a85b08d330615c5/charset_normalizer-3.4.3-py3-none-any.whl", hash = "sha256:ce571ab16d890d23b5c278547ba694193a45011ff86a9162a71307ed9f86759a", size = 53175, upload-time = "2025-08-09T07:57:26.864Z" }, ] @@ -420,9 +394,9 @@ name = "datafusion" version = "53.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pyarrow", version = "21.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "pyarrow", version = "21.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "pyarrow", version = "23.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "typing-extensions", marker = "python_full_version >= '3.10' and python_full_version < '3.13'" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/58/2b/0f96f12b70839c93930c4e17d767fc32b6c77d548c78784128049e944701/datafusion-53.0.0.tar.gz", hash = "sha256:ba9a5ec06b5453fbd8710d6aeeb515a8bcac4b6c140e254409bb53a5f322ef22", size = 224267, upload-time = "2026-04-13T00:45:02.686Z" } wheels = [ @@ -433,42 +407,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4b/1a/ea4831fc6aeefedbcf186c9f6a273d507b1787c03cbb905bded7e1149a6a/datafusion-53.0.0-cp310-abi3-win_amd64.whl", hash = "sha256:4c8410f5f659b926677be6c7d443bbc05d825c078c970b7d8cf977ebcf948314", size = 38120687, upload-time = "2026-04-13T00:45:00.633Z" }, ] -[[package]] -name = "datasets" -version = "0.0.9" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -sdist = { url = "https://files.pythonhosted.org/packages/cd/fe/4d2874473a753d59c83335691bd9532704f2605418a0d288a1d70fa003fc/datasets-0.0.9.zip", hash = 
"sha256:86d54441bab87aebb2aa3bf0853aa7fb7abed8c708f9bb08a88e86a498972010", size = 4013, upload-time = "2015-08-18T00:07:40.556Z" } - [[package]] name = "datasets" version = "4.1.1" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14'", - "python_full_version == '3.13.*'", - "python_full_version == '3.12.*'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", -] dependencies = [ - { name = "dill", marker = "python_full_version >= '3.10'" }, - { name = "filelock", marker = "python_full_version >= '3.10'" }, - { name = "fsspec", extra = ["http"], marker = "python_full_version >= '3.10'" }, - { name = "huggingface-hub", marker = "python_full_version >= '3.10'" }, - { name = "multiprocess", marker = "python_full_version >= '3.10'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "dill" }, + { name = "filelock" }, + { name = "fsspec", extra = ["http"] }, + { name = "huggingface-hub" }, + { name = "multiprocess" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "packaging", marker = "python_full_version >= '3.10'" }, - { name = "pandas", marker = "python_full_version >= '3.10'" }, - { name = "pyarrow", version = "21.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "packaging" }, + { name = "pandas" }, + { name = "pyarrow", version = "21.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "pyarrow", version = "23.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "pyyaml", marker = 
"python_full_version >= '3.10'" }, - { name = "requests", marker = "python_full_version >= '3.10'" }, - { name = "tqdm", marker = "python_full_version >= '3.10'" }, - { name = "xxhash", marker = "python_full_version >= '3.10'" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "tqdm" }, + { name = "xxhash" }, ] sdist = { url = "https://files.pythonhosted.org/packages/91/a4/73f8e6ef52c535e1d20d5b2ca83bfe6de399d8b8b8a61ccc8d63d60735aa/datasets-4.1.1.tar.gz", hash = "sha256:7d8d5ba8b12861d2c44bfff9c83484ebfafff1ff553371e5901a8d3aab5450e2", size = 579324, upload-time = "2025-09-18T13:14:27.108Z" } wheels = [ @@ -514,12 +472,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/23/32/57866cf8881288b3dfb9212720221fb890daaa534dbdc6fe3fff3979ecd1/duckdb-1.4.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2de258a93435c977a0ec3a74ec8f60c2f215ddc73d427ee49adc4119558facd3", size = 18421289, upload-time = "2025-09-16T10:22:21.564Z" }, { url = "https://files.pythonhosted.org/packages/a0/83/7438fb43be451a7d4a04650aaaf662b2ff2d95895bbffe3e0e28cbe030c9/duckdb-1.4.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a6d3659641d517dd9ed1ab66f110cdbdaa6900106f116effaf2dbedd83c38de3", size = 20426547, upload-time = "2025-09-16T10:22:23.759Z" }, { url = "https://files.pythonhosted.org/packages/21/b2/98fb89ae81611855f35984e96f648d871f3967bb3f524b51d1372d052f0c/duckdb-1.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:07fcc612ea5f0fe6032b92bcc93693034eb00e7a23eb9146576911d5326af4f7", size = 12290467, upload-time = "2025-09-16T10:22:25.923Z" }, - { url = "https://files.pythonhosted.org/packages/8d/42/0f355319b3e8ee1703d0e17378dd829db391434306621f85c110134f2763/duckdb-1.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:1c97ee61c582002b654331f7fd967d6b1e83bf7fdb0772f409dfd4b6af3a70f4", size = 31292373, upload-time = "2025-09-16T10:22:28.118Z" }, - { url = 
"https://files.pythonhosted.org/packages/fd/52/091dbef5eb2ac4e60a9c6d38fcc7c7530a75fafa0f37658450e8731a265b/duckdb-1.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:74e3d6295355160df5d3588b880e8bcae23fdd6f573f538793a8a1abf4c2c29d", size = 17288145, upload-time = "2025-09-16T10:22:30.346Z" }, - { url = "https://files.pythonhosted.org/packages/c9/6c/879317d9c3ac7a2a1f0618ca536a48ebfa4b9fe202f9783e07070e168192/duckdb-1.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d0c76425e4ffe98069dd4fc4752ab919a4125dc0d176bb676b3065fdea152c42", size = 14816258, upload-time = "2025-09-16T10:22:32.442Z" }, - { url = "https://files.pythonhosted.org/packages/95/87/83ac8e67c0530b69fe39f91bbb7f3bd0a49b0c24216cffa9c5561fb2845c/duckdb-1.4.0-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c122bd7d80ab5057f53024ee3922d7612a5cdc99583fae730990964aebc3fd4", size = 18391043, upload-time = "2025-09-16T10:22:34.616Z" }, - { url = "https://files.pythonhosted.org/packages/d6/01/1d70bd6c594ef915c004edc0f1119d1602173dc5ce91c1eed7368f6aab34/duckdb-1.4.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:30689c1436bca723526be6102fe1f4f82ea6d4780fb9ca196bda7ed5ec227950", size = 20385348, upload-time = "2025-09-16T10:22:36.982Z" }, - { url = "https://files.pythonhosted.org/packages/b6/04/0650128cdcdc5208c4f51341a0a3f8db436ecaba51032c6065e20ea0baae/duckdb-1.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:4c55a367c1296617cff89c5e1c7153f1dc3c3b556ef70711a45b0236515f80c2", size = 12283322, upload-time = "2025-09-16T10:22:39.388Z" }, ] [[package]] @@ -543,29 +495,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/42/14/42b2651a2f46b022ccd948bca9f2d5af0fd8929c4eec235b8d6d844fbe67/filelock-3.19.1-py3-none-any.whl", hash = "sha256:d38e30481def20772f5baf097c122c3babc4fcdb7e14e57049eb9d88c6dc017d", size = 15988, upload-time = "2025-08-14T16:56:01.633Z" }, ] -[[package]] -name = "flatbuffers" -version = "2.0.7" -source = { registry = 
"https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -sdist = { url = "https://files.pythonhosted.org/packages/d1/90/0532e737a11e1dc50e9e352c3ccc97338cb75991f83279c2edbc9234e022/flatbuffers-2.0.7.tar.gz", hash = "sha256:0ae7d69c5b82bf41962ca5fde9cc43033bc9501311d975fd5a25e8a7d29c1245", size = 22686, upload-time = "2022-08-23T22:50:07.903Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d7/0d/b5bfb553a6ac66d6ec2b6d7f1e814a908fba7188356ac94bb36ae3d905c3/flatbuffers-2.0.7-py2.py3-none-any.whl", hash = "sha256:71e135d533be527192819aaab757c5e3d109cb10fbb01e687f6bdb7a61ad39d1", size = 26562, upload-time = "2022-08-23T22:50:56.342Z" }, -] - [[package]] name = "flatbuffers" version = "25.9.23" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14'", - "python_full_version == '3.13.*'", - "python_full_version == '3.12.*'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/9d/1f/3ee70b0a55137442038f2a33469cc5fddd7e0ad2abf83d7497c18a2b6923/flatbuffers-25.9.23.tar.gz", hash = "sha256:676f9fa62750bb50cf531b42a0a2a118ad8f7f797a511eda12881c016f093b12", size = 22067, upload-time = "2025-09-24T05:25:30.106Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/ee/1b/00a78aa2e8fbd63f9af08c9c19e6deb3d5d66b4dda677a0f61654680ee89/flatbuffers-25.9.23-py2.py3-none-any.whl", hash = "sha256:255538574d6cb6d0a79a17ec8bc0d30985913b87513a01cce8bcdb6b4c44d0e2", size = 30869, upload-time = "2025-09-24T05:25:28.912Z" }, @@ -662,23 +595,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/40/37/5f9f3c3fd7f7746082ec67bcdc204db72dad081f4f83a503d33220a92973/frozenlist-1.7.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:1a85e345b4c43db8b842cab1feb41be5cc0b10a1830e6295b69d7310f99becaf", size = 282620, upload-time = "2025-06-09T23:02:00.493Z" }, { url = 
"https://files.pythonhosted.org/packages/0b/31/8fbc5af2d183bff20f21aa743b4088eac4445d2bb1cdece449ae80e4e2d1/frozenlist-1.7.0-cp313-cp313t-win32.whl", hash = "sha256:3a14027124ddb70dfcee5148979998066897e79f89f64b13328595c4bdf77c81", size = 43059, upload-time = "2025-06-09T23:02:02.072Z" }, { url = "https://files.pythonhosted.org/packages/bb/ed/41956f52105b8dbc26e457c5705340c67c8cc2b79f394b79bffc09d0e938/frozenlist-1.7.0-cp313-cp313t-win_amd64.whl", hash = "sha256:3bf8010d71d4507775f658e9823210b7427be36625b387221642725b515dcf3e", size = 47516, upload-time = "2025-06-09T23:02:03.779Z" }, - { url = "https://files.pythonhosted.org/packages/dd/b1/ee59496f51cd244039330015d60f13ce5a54a0f2bd8d79e4a4a375ab7469/frozenlist-1.7.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:cea3dbd15aea1341ea2de490574a4a37ca080b2ae24e4b4f4b51b9057b4c3630", size = 82434, upload-time = "2025-06-09T23:02:05.195Z" }, - { url = "https://files.pythonhosted.org/packages/75/e1/d518391ce36a6279b3fa5bc14327dde80bcb646bb50d059c6ca0756b8d05/frozenlist-1.7.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7d536ee086b23fecc36c2073c371572374ff50ef4db515e4e503925361c24f71", size = 48232, upload-time = "2025-06-09T23:02:07.728Z" }, - { url = "https://files.pythonhosted.org/packages/b7/8d/a0d04f28b6e821a9685c22e67b5fb798a5a7b68752f104bfbc2dccf080c4/frozenlist-1.7.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:dfcebf56f703cb2e346315431699f00db126d158455e513bd14089d992101e44", size = 47186, upload-time = "2025-06-09T23:02:09.243Z" }, - { url = "https://files.pythonhosted.org/packages/93/3a/a5334c0535c8b7c78eeabda1579179e44fe3d644e07118e59a2276dedaf1/frozenlist-1.7.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:974c5336e61d6e7eb1ea5b929cb645e882aadab0095c5a6974a111e6479f8878", size = 226617, upload-time = "2025-06-09T23:02:10.949Z" }, - { url = 
"https://files.pythonhosted.org/packages/0a/67/8258d971f519dc3f278c55069a775096cda6610a267b53f6248152b72b2f/frozenlist-1.7.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:c70db4a0ab5ab20878432c40563573229a7ed9241506181bba12f6b7d0dc41cb", size = 224179, upload-time = "2025-06-09T23:02:12.603Z" }, - { url = "https://files.pythonhosted.org/packages/fc/89/8225905bf889b97c6d935dd3aeb45668461e59d415cb019619383a8a7c3b/frozenlist-1.7.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1137b78384eebaf70560a36b7b229f752fb64d463d38d1304939984d5cb887b6", size = 235783, upload-time = "2025-06-09T23:02:14.678Z" }, - { url = "https://files.pythonhosted.org/packages/54/6e/ef52375aa93d4bc510d061df06205fa6dcfd94cd631dd22956b09128f0d4/frozenlist-1.7.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e793a9f01b3e8b5c0bc646fb59140ce0efcc580d22a3468d70766091beb81b35", size = 229210, upload-time = "2025-06-09T23:02:16.313Z" }, - { url = "https://files.pythonhosted.org/packages/ee/55/62c87d1a6547bfbcd645df10432c129100c5bd0fd92a384de6e3378b07c1/frozenlist-1.7.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74739ba8e4e38221d2c5c03d90a7e542cb8ad681915f4ca8f68d04f810ee0a87", size = 215994, upload-time = "2025-06-09T23:02:17.9Z" }, - { url = "https://files.pythonhosted.org/packages/45/d2/263fea1f658b8ad648c7d94d18a87bca7e8c67bd6a1bbf5445b1bd5b158c/frozenlist-1.7.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e63344c4e929b1a01e29bc184bbb5fd82954869033765bfe8d65d09e336a677", size = 225122, upload-time = "2025-06-09T23:02:19.479Z" }, - { url = "https://files.pythonhosted.org/packages/7b/22/7145e35d12fb368d92124f679bea87309495e2e9ddf14c6533990cb69218/frozenlist-1.7.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = 
"sha256:2ea2a7369eb76de2217a842f22087913cdf75f63cf1307b9024ab82dfb525938", size = 224019, upload-time = "2025-06-09T23:02:20.969Z" }, - { url = "https://files.pythonhosted.org/packages/44/1e/7dae8c54301beb87bcafc6144b9a103bfd2c8f38078c7902984c9a0c4e5b/frozenlist-1.7.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:836b42f472a0e006e02499cef9352ce8097f33df43baaba3e0a28a964c26c7d2", size = 239925, upload-time = "2025-06-09T23:02:22.466Z" }, - { url = "https://files.pythonhosted.org/packages/4b/1e/99c93e54aa382e949a98976a73b9b20c3aae6d9d893f31bbe4991f64e3a8/frozenlist-1.7.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:e22b9a99741294b2571667c07d9f8cceec07cb92aae5ccda39ea1b6052ed4319", size = 220881, upload-time = "2025-06-09T23:02:24.521Z" }, - { url = "https://files.pythonhosted.org/packages/5e/9c/ca5105fa7fb5abdfa8837581be790447ae051da75d32f25c8f81082ffc45/frozenlist-1.7.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:9a19e85cc503d958abe5218953df722748d87172f71b73cf3c9257a91b999890", size = 234046, upload-time = "2025-06-09T23:02:26.206Z" }, - { url = "https://files.pythonhosted.org/packages/8d/4d/e99014756093b4ddbb67fb8f0df11fe7a415760d69ace98e2ac6d5d43402/frozenlist-1.7.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:f22dac33bb3ee8fe3e013aa7b91dc12f60d61d05b7fe32191ffa84c3aafe77bd", size = 235756, upload-time = "2025-06-09T23:02:27.79Z" }, - { url = "https://files.pythonhosted.org/packages/8b/72/a19a40bcdaa28a51add2aaa3a1a294ec357f36f27bd836a012e070c5e8a5/frozenlist-1.7.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:9ccec739a99e4ccf664ea0775149f2749b8a6418eb5b8384b4dc0a7d15d304cb", size = 222894, upload-time = "2025-06-09T23:02:29.848Z" }, - { url = "https://files.pythonhosted.org/packages/08/49/0042469993e023a758af81db68c76907cd29e847d772334d4d201cbe9a42/frozenlist-1.7.0-cp39-cp39-win32.whl", hash = "sha256:b3950f11058310008a87757f3eee16a8e1ca97979833239439586857bc25482e", size = 39848, upload-time = "2025-06-09T23:02:31.413Z" }, - { 
url = "https://files.pythonhosted.org/packages/5a/45/827d86ee475c877f5f766fbc23fb6acb6fada9e52f1c9720e2ba3eae32da/frozenlist-1.7.0-cp39-cp39-win_amd64.whl", hash = "sha256:43a82fce6769c70f2f5a06248b614a7d268080a9d20f7457ef10ecee5af82b63", size = 44102, upload-time = "2025-06-09T23:02:32.808Z" }, { url = "https://files.pythonhosted.org/packages/ee/45/b82e3c16be2182bff01179db177fe144d58b5dc787a7d4492c6ed8b9317f/frozenlist-1.7.0-py3-none-any.whl", hash = "sha256:9a5af342e34f7e97caf8c995864c7a396418ae2859cc6fdf1b1073020d516a7e", size = 13106, upload-time = "2025-06-09T23:02:34.204Z" }, ] @@ -693,32 +609,13 @@ wheels = [ [package.optional-dependencies] http = [ - { name = "aiohttp", marker = "python_full_version >= '3.10'" }, -] - -[[package]] -name = "gast" -version = "0.4.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -sdist = { url = "https://files.pythonhosted.org/packages/83/4a/07c7e59cef23fb147454663c3271c21da68ba2ab141427c20548ae5a8a4d/gast-0.4.0.tar.gz", hash = "sha256:40feb7b8b8434785585ab224d1568b857edb18297e5a3047f1ba012bc83b42c1", size = 13804, upload-time = "2020-08-07T21:45:23.526Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b6/48/583c032b79ae5b3daa02225a675aeb673e58d2cb698e78510feceb11958c/gast-0.4.0-py3-none-any.whl", hash = "sha256:b7adcdd5adbebf1adf17378da5ba3f543684dbec47b1cda1f3997e573cd542c4", size = 9824, upload-time = "2020-08-07T21:45:21.32Z" }, + { name = "aiohttp" }, ] [[package]] name = "gast" version = "0.6.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14'", - "python_full_version == '3.13.*'", - "python_full_version == '3.12.*'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/3c/14/c566f5ca00c115db7725263408ff952b8ae6d6a4e792ef9c84e77d9af7a1/gast-0.6.0.tar.gz", hash = 
"sha256:88fc5300d32c7ac6ca7b515310862f71e6fdf2c029bbec7c66c0f5dd47b6b1fb", size = 27708, upload-time = "2024-06-27T20:31:49.527Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/a3/61/8001b38461d751cd1a0c3a6ae84346796a5758123f3ed97a1b121dfbf4f3/gast-0.6.0-py3-none-any.whl", hash = "sha256:52b182313f7330389f72b069ba00f174cfe2a06411099547288839c6cbafbd54", size = 21173, upload-time = "2024-07-09T13:15:15.615Z" }, @@ -730,8 +627,7 @@ version = "0.6.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "arro3-core" }, - { name = "pyproj", version = "3.6.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "pyproj", version = "3.7.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "pyproj", version = "3.7.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "pyproj", version = "3.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, ] wheels = [ @@ -782,15 +678,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e6/e2/a9923e4c5848ace6e3e6f09a40d3860955f7d836675affe35bc79bc27033/geoarrow_rust_core-0.6.3-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c67201bd662e4732a822f91651111bc024329b3e71eba9f4eed19e58c9cf789b", size = 4742518, upload-time = "2026-06-11T19:25:48.098Z" }, { url = "https://files.pythonhosted.org/packages/e6/c7/3112def9e93e88341210dd22b4d04c598fb4d0726adef2114b68157354d5/geoarrow_rust_core-0.6.3-cp314-cp314-pyemscripten_2026_0_wasm32.whl", hash = "sha256:8461e6d07a7b39ab099c9885a68d5e7983d4e83a82a42dd5b331c543683c9d6e", size = 1959191, upload-time = "2026-06-11T19:25:49.668Z" }, { url = "https://files.pythonhosted.org/packages/ed/0f/de74ce2171c408e4b4a7660f69f6dfaa294797a18a209fa85b1ea79be141/geoarrow_rust_core-0.6.3-cp314-cp314-win_amd64.whl", hash = 
"sha256:5d2fd45d09bf700e0ca4d30b51ebcd59fb8d1a9eb4a4d7b4fc5f53a6cca59475", size = 3603948, upload-time = "2026-06-11T19:25:51.078Z" }, - { url = "https://files.pythonhosted.org/packages/58/33/fff80f597a0efb30816e8acf153fa0751891b22abec13cb085cf5d4c48fa/geoarrow_rust_core-0.6.3-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:77a90e435db4ca0deceb3b239dd9ee4302ce19dbddfaa4bec2fad69a32d8a519", size = 3859560, upload-time = "2026-06-11T19:25:52.482Z" }, - { url = "https://files.pythonhosted.org/packages/38/44/e7b02d661718b49c7b1fa609d2efc5b61276cb0817aeb5cd6a3f50f4834b/geoarrow_rust_core-0.6.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0699909bba568e57a27ce39fa4bfa4dfdfec510aa97e17c1a62082bcfabd8fcf", size = 3713382, upload-time = "2026-06-11T19:25:54.405Z" }, - { url = "https://files.pythonhosted.org/packages/b6/89/877059911db5e119d9cb0237ace2ef5ce70452877d6ac684f25b827ea007/geoarrow_rust_core-0.6.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:720b80031efc2a356727b179d0676c7b34539459806e5fd680121364eb226b91", size = 4201145, upload-time = "2026-06-11T19:25:55.783Z" }, - { url = "https://files.pythonhosted.org/packages/50/e2/9540486e62e1aab7737c103b08caf87fe3337d627a1e1c278a476839495a/geoarrow_rust_core-0.6.3-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:34543b11f9a383256fc5a98745d298a451bb6a0325a0d28156c2ede95a205ac6", size = 4271238, upload-time = "2026-06-11T19:25:57.333Z" }, - { url = "https://files.pythonhosted.org/packages/a2/0a/58b80698ac176834c718c383f9650fdefe517586fc044973e9243da6980f/geoarrow_rust_core-0.6.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fdcad76f4ee70e6c0c3fa25efc106af0410fd33aea9e3291c0c54d7adee19a5d", size = 5607074, upload-time = "2026-06-11T19:25:58.868Z" }, - { url = 
"https://files.pythonhosted.org/packages/45/cb/7c9af6e1dc21ab73fcb9ef08f6ff8d081bd9aa05d6011743adff51e15cf1/geoarrow_rust_core-0.6.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:181b0d39f1ac2974abd697b5bcd75a4cb5b787b94746d6436311a79e5b1c94ea", size = 4416546, upload-time = "2026-06-11T19:26:00.596Z" }, - { url = "https://files.pythonhosted.org/packages/00/4c/a80882daf7fafa515f103e2a9504f2f86c3f044efe195ab7f2b870ee95d6/geoarrow_rust_core-0.6.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:060a70669ed6363aa267c09c386af9eb641435832ea6abb64191826dcc4bb162", size = 4253545, upload-time = "2026-06-11T19:26:02.172Z" }, - { url = "https://files.pythonhosted.org/packages/12/14/2c63cffe79b5988e91ef90d1fd149270496de9ec4f5106a887d254787f4f/geoarrow_rust_core-0.6.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e569b79e49c52898c6f7fc134d9704d9b24c0609fc9ce5ed9ed03eb80f510862", size = 4749440, upload-time = "2026-06-11T19:26:03.708Z" }, - { url = "https://files.pythonhosted.org/packages/6c/53/3d0c8f94fcd09e44af707716e10a64e30c855acaa38093468fb9518f1f83/geoarrow_rust_core-0.6.3-cp39-cp39-win_amd64.whl", hash = "sha256:0457ce64df184727fdcad581ddb32004947588dd1495ea1133cf786f958a1197", size = 3604831, upload-time = "2026-06-11T19:26:05.088Z" }, ] [[package]] @@ -799,8 +686,7 @@ version = "0.6.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "arro3-core" }, - { name = "pyproj", version = "3.6.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "pyproj", version = "3.7.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "pyproj", version = "3.7.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "pyproj", version = "3.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= 
'3.11'" }, ] wheels = [ @@ -844,14 +730,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/41/14/1ec1ba4df851b477d802285e8b770f65e6774f0d6272e4e8548c8758892c/geoarrow_rust_io-0.6.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a10e67d95a134dbb5f657fe3436ea645c6760a4ffef44df211f7d9b8fb687e6", size = 10499137, upload-time = "2025-12-03T19:02:24.514Z" }, { url = "https://files.pythonhosted.org/packages/a5/66/7ad618415790671664e76596c000e812e0bd39e8f347f4eb7b8e3f519a55/geoarrow_rust_io-0.6.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:61ccbb528bbe4834849c501e5990a4a6f4b87976ca6a22df7859f16760c79590", size = 10394123, upload-time = "2025-12-03T19:02:01.248Z" }, { url = "https://files.pythonhosted.org/packages/43/4b/4520af8c694ca0932f995c91d604837741522bd02b66414fdff4521abc98/geoarrow_rust_io-0.6.1-cp314-cp314-win_amd64.whl", hash = "sha256:aa46f6beda6c267f420ea390f071fadd0161094c1db8d71ad54002c006fe7f21", size = 8989484, upload-time = "2025-12-03T19:02:40.081Z" }, - { url = "https://files.pythonhosted.org/packages/69/87/efadbf1bb9d359f55791f7198cf9aa87f0272be6a2d373f5844f5e59cd1e/geoarrow_rust_io-0.6.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:46e3e41b726b250b44a829ab41489e5008280acb8af8e68001230babf04bafd8", size = 9780411, upload-time = "2025-11-21T02:11:30.128Z" }, - { url = "https://files.pythonhosted.org/packages/95/73/5e108b286b219d3a46042cfa0830e0f075f4addd01f83f7c851a933919ae/geoarrow_rust_io-0.6.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:bb95364b726c34c23fb93ebc9c08b8fa1d52062a4a9c1ac614ff8761a339ba7a", size = 9316307, upload-time = "2025-11-21T02:11:21.195Z" }, - { url = "https://files.pythonhosted.org/packages/06/76/89c387d6d4d303feef328fc9c63df76cea52963e2046f2c092b434fb04a9/geoarrow_rust_io-0.6.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:691a67ef3a5214fb704d1a19d33a9ddf173483c3943056fb965101c19b0edd28", size = 10309182, upload-time = 
"2025-11-21T02:10:34.063Z" }, - { url = "https://files.pythonhosted.org/packages/ff/08/34ed2d76ebfb34ed6bf3312defad16b2b5246e40d59e46443a6fe19e85dd/geoarrow_rust_io-0.6.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91c82e9cbae6759798a8e4a87adb13ea617090a5498f384fc56c44775653d7f0", size = 11291230, upload-time = "2025-11-21T02:10:57.771Z" }, - { url = "https://files.pythonhosted.org/packages/e9/f5/9c25512c1f31101125555367e55ff28f72f449c8f56ff06c5be9e3feb9e5/geoarrow_rust_io-0.6.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d9c2b609addc7a810eab5cd573243710d95afe8486f829edd05b311d51bbb5af", size = 13300664, upload-time = "2025-11-21T02:10:46.082Z" }, - { url = "https://files.pythonhosted.org/packages/f5/aa/14be165b439d3a3ffc6ced96f971b02df255e86b82c7e1f9f340d35689c3/geoarrow_rust_io-0.6.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6abdc80e130f472f55598543a4bb9ba522d6502a5d80017a952027a9e9c1d1ce", size = 10486589, upload-time = "2025-11-21T02:11:09.681Z" }, - { url = "https://files.pythonhosted.org/packages/5a/df/1c36bae723561785ce47e463f6366a3c52994795a168d7c4ed5e457e9a37/geoarrow_rust_io-0.6.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:c4638a89d61629110dde474b3d410ee2e71c89d2035ab2f2557857e7eee4ea30", size = 10395106, upload-time = "2025-11-21T02:10:20.832Z" }, - { url = "https://files.pythonhosted.org/packages/47/d4/4e9cffad7647c07a5cd1cce68c97102dd011652168e3e09a2dedc1253a5e/geoarrow_rust_io-0.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:4811e96b1777fcf12ac2416872407b1e4717f9a59fe5b80ce02b1e9a087d1b5e", size = 8988735, upload-time = "2025-11-21T02:11:39.164Z" }, { url = "https://files.pythonhosted.org/packages/e6/9f/32059400bb853eafe5d37d8c4ae9e48cd9c43820287e435cc1566f42208e/geoarrow_rust_io-0.6.1-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ef94f84ba4efb42d63588241733e1b62bbdb4edeac5513baeb7bfb07db4f204a", size = 10303111, 
upload-time = "2025-11-21T02:10:36.067Z" }, { url = "https://files.pythonhosted.org/packages/6c/a2/7db0a685eafa41e9565a3c4e441f41d2630c084f616d2669c5fe8f5805ef/geoarrow_rust_io-0.6.1-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:872dd92c52b2df342d34ac42d1b710c91c58e9dd93f5c88098816f9cd9dc8a84", size = 11299498, upload-time = "2025-11-21T02:11:00.19Z" }, { url = "https://files.pythonhosted.org/packages/13/b4/1bfbfbe828ca51b4f314d9f70514c2ff19923714aa7d51ef1b0ec8600aed/geoarrow_rust_io-0.6.1-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:235a7ea94faa95a4699f6577765a5e5a88bee079828c3d9015d9d5c6c240459c", size = 13299230, upload-time = "2025-11-21T02:10:48.12Z" }, @@ -915,13 +793,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4e/9d/5e3e362815152aa1afd8b26ea613effa005962f9da0eec6e0e4527e7a7d1/grpcio-1.75.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:3e71a2105210366bfc398eef7f57a664df99194f3520edb88b9c3a7e46ee0d64", size = 7081061, upload-time = "2025-09-26T09:02:58.261Z" }, { url = "https://files.pythonhosted.org/packages/1e/1a/46615682a19e100f46e31ddba9ebc297c5a5ab9ddb47b35443ffadb8776c/grpcio-1.75.1-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:8679aa8a5b67976776d3c6b0521e99d1c34db8a312a12bcfd78a7085cb9b604e", size = 8010849, upload-time = "2025-09-26T09:03:00.548Z" }, { url = "https://files.pythonhosted.org/packages/67/8e/3204b94ac30b0f675ab1c06540ab5578660dc8b690db71854d3116f20d00/grpcio-1.75.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:aad1c774f4ebf0696a7f148a56d39a3432550612597331792528895258966dc0", size = 7464478, upload-time = "2025-09-26T09:03:03.096Z" }, - { url = "https://files.pythonhosted.org/packages/8f/e2/33efd823a879dc7b60c10192df1900ee5c200f8e782663a41a3b2aecd143/grpcio-1.75.1-cp39-cp39-linux_armv7l.whl", hash = "sha256:c09fba33327c3ac11b5c33dbdd8218eef8990d78f83b1656d628831812a8c0fb", size = 5706679, upload-time = 
"2025-09-26T09:03:10.218Z" }, - { url = "https://files.pythonhosted.org/packages/77/90/b80e75f8cce758425b2772742eed4e9db765a965d902ba4b7f239b2513de/grpcio-1.75.1-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c12121e509b9f8b0914d10054d24120237d19e870b1cd82acbb8a9b9ddd198a3", size = 6291926, upload-time = "2025-09-26T09:03:16.282Z" }, - { url = "https://files.pythonhosted.org/packages/40/5f/e6033d8f99063350e20873a46225468b73045b9ef2c8cba73d66a87c3fd5/grpcio-1.75.1-cp39-cp39-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:73577a93e692b3474b1bfe84285d098de36705dbd838bb4d6a056d326e4dc880", size = 6950040, upload-time = "2025-09-26T09:03:18.874Z" }, - { url = "https://files.pythonhosted.org/packages/01/12/34076c079b45af5aed40f037fffe388d7fbe90dd539ed01e4744c926d227/grpcio-1.75.1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e19e7dfa0d7ca7dea22be464339e18ac608fd75d88c56770c646cdabe54bc724", size = 6465780, upload-time = "2025-09-26T09:03:21.219Z" }, - { url = "https://files.pythonhosted.org/packages/e4/c5/ee6fd69a9f6e7288d04da010ad7480a0566d2aac81097ff4dafbc5ffa9b6/grpcio-1.75.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:4e1c28f51c1cf67eccdfc1065e8e866c9ed622f09773ca60947089c117f848a1", size = 7098308, upload-time = "2025-09-26T09:03:23.875Z" }, - { url = "https://files.pythonhosted.org/packages/78/32/f2be13f13035361768923159fe20470a7d22db2c7c692b952e21284f56e5/grpcio-1.75.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:030a6164bc2ca726052778c0cf8e3249617a34e368354f9e6107c27ad4af8c28", size = 8042268, upload-time = "2025-09-26T09:03:26.268Z" }, - { url = "https://files.pythonhosted.org/packages/e7/2d/1bb0572f0a2eaab100b4635c6c2cd0d37e3cda5554037e3f90b1bc428d56/grpcio-1.75.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:67697efef5a98d46d5db7b1720fa4043536f8b8e5072a5d61cfca762f287e939", size = 7491470, upload-time = "2025-09-26T09:03:28.906Z" }, ] [[package]] @@ -929,8 +800,7 @@ name = 
"h5py" version = "3.14.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/5d/57/dfb3c5c3f1bf5f5ef2e59a22dec4ff1f3d7408b55bfcefcfb0ea69ef21c6/h5py-3.14.0.tar.gz", hash = "sha256:2372116b2e0d5d3e5e705b7f663f7c8d96fa79a4052d250484ef91d24d6a08f4", size = 424323, upload-time = "2025-06-06T14:06:15.01Z" } @@ -943,8 +813,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/86/f9/f00de11c82c88bfc1ef22633557bfba9e271e0cb3189ad704183fc4a2644/h5py-3.14.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0cbd41f4e3761f150aa5b662df991868ca533872c95467216f2bec5fcad84882", size = 4929422, upload-time = "2025-06-06T14:05:18.399Z" }, { url = "https://files.pythonhosted.org/packages/0d/ce/3a21d87896bc7e3e9255e0ad5583ae31ae9e6b4b00e0bcb2a67e2b6acdbc/h5py-3.14.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8cbaf6910fa3983c46172666b0b8da7b7bd90d764399ca983236f2400436eeb", size = 4700675, upload-time = "2025-06-06T14:05:37.38Z" }, { url = "https://files.pythonhosted.org/packages/e7/ec/86f59025306dcc6deee5fda54d980d077075b8d9889aac80f158bd585f1b/h5py-3.14.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d90e6445ab7c146d7f7981b11895d70bc1dd91278a4f9f9028bc0c95e4a53f13", size = 4921632, upload-time = "2025-06-06T14:05:43.464Z" }, - { url = 
"https://files.pythonhosted.org/packages/66/40/b423b57696514e05aa7bb06150ef96667d0e0006cc6de7ab52c71734ab51/h5py-3.14.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:573c33ad056ac7c1ab6d567b6db9df3ffc401045e3f605736218f96c1e0490c6", size = 4326368, upload-time = "2025-06-06T14:06:00.782Z" }, - { url = "https://files.pythonhosted.org/packages/f7/07/e088f89f04fdbe57ddf9de377f857158d3daa38cf5d0fb20ef9bd489e313/h5py-3.14.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ccbe17dc187c0c64178f1a10aa274ed3a57d055117588942b8a08793cc448216", size = 4559686, upload-time = "2025-06-06T14:06:07.416Z" }, ] [[package]] @@ -967,14 +835,14 @@ name = "huggingface-hub" version = "0.35.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "filelock", marker = "python_full_version >= '3.10'" }, - { name = "fsspec", marker = "python_full_version >= '3.10'" }, - { name = "hf-xet", marker = "(python_full_version >= '3.10' and platform_machine == 'aarch64') or (python_full_version >= '3.10' and platform_machine == 'amd64') or (python_full_version >= '3.10' and platform_machine == 'arm64') or (python_full_version >= '3.10' and platform_machine == 'x86_64')" }, - { name = "packaging", marker = "python_full_version >= '3.10'" }, - { name = "pyyaml", marker = "python_full_version >= '3.10'" }, - { name = "requests", marker = "python_full_version >= '3.10'" }, - { name = "tqdm", marker = "python_full_version >= '3.10'" }, - { name = "typing-extensions", marker = "python_full_version >= '3.10'" }, + { name = "filelock" }, + { name = "fsspec" }, + { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "tqdm" }, + { name = "typing-extensions" }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/10/7e/a0a97de7c73671863ca6b3f61fa12518caf35db37825e43d63a70956738c/huggingface_hub-0.35.3.tar.gz", hash = "sha256:350932eaa5cc6a4747efae85126ee220e4ef1b54e29d31c3b45c5612ddf0b32a", size = 461798, upload-time = "2025-09-29T14:29:58.625Z" } wheels = [ @@ -990,18 +858,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d2/23/408243171aa9aaba178d3e2559159c24c1171a641aa83b67bdd3394ead8e/idna-3.15-py3-none-any.whl", hash = "sha256:048adeaf8c2d788c40fee287673ccaa74c24ffd8dcf09ffa555a2fbb59f10ac8", size = 72340, upload-time = "2026-05-12T22:45:55.733Z" }, ] -[[package]] -name = "importlib-metadata" -version = "8.7.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "zipp", marker = "python_full_version < '3.10'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/f3/49/3b30cad09e7771a4982d9975a8cbf64f00d4a1ececb53297f1d9a7be1b10/importlib_metadata-8.7.1.tar.gz", hash = "sha256:49fef1ae6440c182052f407c8d34a68f72efc36db9ca90dc0113398f2fdde8bb", size = 57107, upload-time = "2025-12-21T10:00:19.278Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/fa/5e/f8e9a1d23b9c20a551a8a02ea3637b4642e22c2626e3a13a9a29cdea99eb/importlib_metadata-8.7.1-py3-none-any.whl", hash = "sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151", size = 27865, upload-time = "2025-12-21T10:00:18.329Z" }, -] - [[package]] name = "iniconfig" version = "2.1.0" @@ -1032,57 +888,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/31/b4/b9b800c45527aadd64d5b442f9b932b00648617eb5d63d2c7a6587b7cafc/jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980", size = 20256, upload-time = "2022-06-17T18:00:10.251Z" }, ] -[[package]] -name = "keras" -version = "2.7.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/6b/8b/065f94ba03282fa41b2d76942b87a180a9913312c4611ea7d6508fbbc114/keras-2.7.0-py2.py3-none-any.whl", hash = "sha256:0c33ae1f728064ca0d35dfba999e9c316f03623bf5688c82fb83cc74a80ea248", size = 1332171, upload-time = "2021-11-03T16:16:34.318Z" }, -] - [[package]] name = "keras" version = "3.11.3" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14'", - "python_full_version == '3.13.*'", - "python_full_version == '3.12.*'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", -] dependencies = [ - { name = "absl-py", marker = "python_full_version >= '3.10'" }, - { name = "h5py", marker = "python_full_version >= '3.10'" }, - { name = "ml-dtypes", marker = "python_full_version >= '3.10'" }, - { name = "namex", marker = "python_full_version >= '3.10'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "absl-py" }, + { name = "h5py" }, + { name = "ml-dtypes" }, + { name = "namex" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "optree", marker = "python_full_version >= '3.10'" }, - { name = "packaging", marker = "python_full_version >= '3.10'" }, - { name = "rich", marker = "python_full_version >= '3.10'" }, + { name = "optree" }, + { name = "packaging" }, + { name = "rich" }, ] sdist = { url = "https://files.pythonhosted.org/packages/6a/89/646425fe9a46f9053430e1271f817c36041c6f33469950a3caafc3d2591e/keras-3.11.3.tar.gz", hash = "sha256:efda616835c31b7d916d72303ef9adec1257320bc9fd4b2b0138840fc65fb5b7", size = 1065906, upload-time = "2025-08-21T22:08:57.643Z" } wheels = [ { url = 
"https://files.pythonhosted.org/packages/94/5b/4c778cc921ce4b864b238f63f8e3ff6e954ab19b80c9fa680593ad8093d4/keras-3.11.3-py3-none-any.whl", hash = "sha256:f484f050e05ee400455b05ec8c36ed35edc34de94256b6073f56cfe68f65491f", size = 1408438, upload-time = "2025-08-21T22:08:55.858Z" }, ] -[[package]] -name = "keras-preprocessing" -version = "1.1.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "six", marker = "python_full_version < '3.10'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/5e/f1/b44337faca48874333769a29398fe4666686733c8880aa160b9fd5dfe600/Keras_Preprocessing-1.1.2.tar.gz", hash = "sha256:add82567c50c8bc648c14195bf544a5ce7c1f76761536956c3d2978970179ef3", size = 163598, upload-time = "2020-05-14T03:53:48.526Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/79/4c/7c3275a01e12ef9368a892926ab932b33bb13d55794881e3573482b378a7/Keras_Preprocessing-1.1.2-py2.py3-none-any.whl", hash = "sha256:7b82029b130ff61cc99b55f3bd27427df4838576838c5b2f65940e4fcec99a7b", size = 42581, upload-time = "2020-05-14T03:53:47.192Z" }, -] - [[package]] name = "lance-namespace" version = "0.8.6" @@ -1103,8 +928,7 @@ dependencies = [ { name = "pydantic" }, { name = "python-dateutil" }, { name = "typing-extensions" }, - { name = "urllib3", version = "1.26.20", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "urllib3", version = "2.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "urllib3" }, ] sdist = { url = "https://files.pythonhosted.org/packages/c7/80/fb224b4a89c1c1638cde949cb6cce6c3aca7759effbfea46a3d9c3960b21/lance_namespace_urllib3_client-0.8.6.tar.gz", hash = "sha256:b6fb1d306e74a7576e5309919020be744527de484a63dbf5eed10f8b368548df", size = 228772, upload-time = 
"2026-06-12T17:36:42.609Z" } wheels = [ @@ -1127,9 +951,6 @@ wheels = [ name = "markdown" version = "3.9" source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "importlib-metadata", marker = "python_full_version < '3.10'" }, -] sdist = { url = "https://files.pythonhosted.org/packages/8d/37/02347f6d6d8279247a5837082ebc26fc0d5aaeaf75aa013fcbb433c777ab/markdown-3.9.tar.gz", hash = "sha256:d2900fe1782bd33bdbbd56859defef70c2e78fc46668f8eb9df3128138f2cb6a", size = 364585, upload-time = "2025-09-04T20:25:22.885Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/70/ae/44c4a6a4cbb496d93c6257954260fe3a6e91b7bed2240e5dad2a717f5111/markdown-3.9-py3-none-any.whl", hash = "sha256:9f4d91ed810864ea88a6f32c07ba8bee1346c0cc1f6b1f9f6c822f2a9667d280", size = 107441, upload-time = "2025-09-04T20:25:21.784Z" }, @@ -1140,7 +961,7 @@ name = "markdown-it-py" version = "4.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "mdurl", marker = "python_full_version >= '3.10'" }, + { name = "mdurl" }, ] sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload-time = "2025-08-11T12:57:52.854Z" } wheels = [ @@ -1230,17 +1051,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fb/df/5bd7a48c256faecd1d36edc13133e51397e41b73bb77e1a69deab746ebac/markupsafe-3.0.3-cp314-cp314t-win32.whl", hash = "sha256:915c04ba3851909ce68ccc2b8e2cd691618c4dc4c4232fb7982bca3f41fd8c3d", size = 14819, upload-time = "2025-09-27T18:37:26.285Z" }, { url = "https://files.pythonhosted.org/packages/1a/8a/0402ba61a2f16038b48b39bccca271134be00c5c9f0f623208399333c448/markupsafe-3.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4faffd047e07c38848ce017e8725090413cd80cbc23d86e55c587bf979e579c9", size = 15426, upload-time = "2025-09-27T18:37:27.316Z" }, { 
url = "https://files.pythonhosted.org/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa", size = 14146, upload-time = "2025-09-27T18:37:28.327Z" }, - { url = "https://files.pythonhosted.org/packages/56/23/0d8c13a44bde9154821586520840643467aee574d8ce79a17da539ee7fed/markupsafe-3.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:15d939a21d546304880945ca1ecb8a039db6b4dc49b2c5a400387cdae6a62e26", size = 11623, upload-time = "2025-09-27T18:37:29.296Z" }, - { url = "https://files.pythonhosted.org/packages/fd/23/07a2cb9a8045d5f3f0890a8c3bc0859d7a47bfd9a560b563899bec7b72ed/markupsafe-3.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f71a396b3bf33ecaa1626c255855702aca4d3d9fea5e051b41ac59a9c1c41edc", size = 12049, upload-time = "2025-09-27T18:37:30.234Z" }, - { url = "https://files.pythonhosted.org/packages/bc/e4/6be85eb81503f8e11b61c0b6369b6e077dcf0a74adbd9ebf6b349937b4e9/markupsafe-3.0.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0f4b68347f8c5eab4a13419215bdfd7f8c9b19f2b25520968adfad23eb0ce60c", size = 21923, upload-time = "2025-09-27T18:37:31.177Z" }, - { url = "https://files.pythonhosted.org/packages/6f/bc/4dc914ead3fe6ddaef035341fee0fc956949bbd27335b611829292b89ee2/markupsafe-3.0.3-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e8fc20152abba6b83724d7ff268c249fa196d8259ff481f3b1476383f8f24e42", size = 20543, upload-time = "2025-09-27T18:37:32.168Z" }, - { url = "https://files.pythonhosted.org/packages/89/6e/5fe81fbcfba4aef4093d5f856e5c774ec2057946052d18d168219b7bd9f9/markupsafe-3.0.3-cp39-cp39-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:949b8d66bc381ee8b007cd945914c721d9aba8e27f71959d750a46f7c282b20b", size = 20585, upload-time = "2025-09-27T18:37:33.166Z" }, - { url = 
"https://files.pythonhosted.org/packages/f6/f6/e0e5a3d3ae9c4020f696cd055f940ef86b64fe88de26f3a0308b9d3d048c/markupsafe-3.0.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:3537e01efc9d4dccdf77221fb1cb3b8e1a38d5428920e0657ce299b20324d758", size = 21387, upload-time = "2025-09-27T18:37:34.185Z" }, - { url = "https://files.pythonhosted.org/packages/c8/25/651753ef4dea08ea790f4fbb65146a9a44a014986996ca40102e237aa49a/markupsafe-3.0.3-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:591ae9f2a647529ca990bc681daebdd52c8791ff06c2bfa05b65163e28102ef2", size = 20133, upload-time = "2025-09-27T18:37:35.138Z" }, - { url = "https://files.pythonhosted.org/packages/dc/0a/c3cf2b4fef5f0426e8a6d7fce3cb966a17817c568ce59d76b92a233fdbec/markupsafe-3.0.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a320721ab5a1aba0a233739394eb907f8c8da5c98c9181d1161e77a0c8e36f2d", size = 20588, upload-time = "2025-09-27T18:37:36.096Z" }, - { url = "https://files.pythonhosted.org/packages/cd/1b/a7782984844bd519ad4ffdbebbba2671ec5d0ebbeac34736c15fb86399e8/markupsafe-3.0.3-cp39-cp39-win32.whl", hash = "sha256:df2449253ef108a379b8b5d6b43f4b1a8e81a061d6537becd5582fba5f9196d7", size = 14566, upload-time = "2025-09-27T18:37:37.09Z" }, - { url = "https://files.pythonhosted.org/packages/18/1f/8d9c20e1c9440e215a44be5ab64359e207fcb4f675543f1cf9a2a7f648d0/markupsafe-3.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:7c3fb7d25180895632e5d3148dbdc29ea38ccb7fd210aa27acbd1201a1902c6e", size = 15053, upload-time = "2025-09-27T18:37:38.054Z" }, - { url = "https://files.pythonhosted.org/packages/4e/d3/fe08482b5cd995033556d45041a4f4e76e7f0521112a9c9991d40d39825f/markupsafe-3.0.3-cp39-cp39-win_arm64.whl", hash = "sha256:38664109c14ffc9e7437e86b4dceb442b0096dfe3541d7864d9cbe1da4cf36c8", size = 13928, upload-time = "2025-09-27T18:37:39.037Z" }, ] [[package]] @@ -1281,8 +1091,7 @@ name = "ml-dtypes" version = "0.5.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy", version = 
"2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/78/a7/aad060393123cfb383956dca68402aff3db1e1caffd5764887ed5153f41b/ml_dtypes-0.5.3.tar.gz", hash = "sha256:95ce33057ba4d05df50b1f3cfefab22e351868a843b3b15a46c65836283670c9", size = 692316, upload-time = "2025-07-29T18:39:19.454Z" } @@ -1317,10 +1126,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/53/21/783dfb51f40d2660afeb9bccf3612b99f6a803d980d2a09132b0f9d216ab/ml_dtypes-0.5.3-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:e12e29764a0e66a7a31e9b8bf1de5cc0423ea72979f45909acd4292de834ccd3", size = 689324, upload-time = "2025-07-29T18:39:07.567Z" }, { url = "https://files.pythonhosted.org/packages/09/f7/a82d249c711abf411ac027b7163f285487f5e615c3e0716c61033ce996ab/ml_dtypes-0.5.3-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:19f6c3a4f635c2fc9e2aa7d91416bd7a3d649b48350c51f7f715a09370a90d93", size = 5275917, upload-time = "2025-07-29T18:39:09.339Z" }, { url = "https://files.pythonhosted.org/packages/7f/3c/541c4b30815ab90ebfbb51df15d0b4254f2f9f1e2b4907ab229300d5e6f2/ml_dtypes-0.5.3-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5ab039ffb40f3dc0aeeeba84fd6c3452781b5e15bef72e2d10bcb33e4bbffc39", size = 5285284, upload-time = "2025-07-29T18:39:11.532Z" }, - { url = "https://files.pythonhosted.org/packages/19/2d/c61af51173083bbf2a3b0f1a1a01d50ef1830436880027433d1b75271083/ml_dtypes-0.5.3-cp39-cp39-macosx_10_9_universal2.whl", hash 
= "sha256:5ee72568d46b9533ad54f78b1e1f3067c0534c5065120ea8ecc6f210d22748b3", size = 663552, upload-time = "2025-07-29T18:39:13.102Z" }, - { url = "https://files.pythonhosted.org/packages/61/0e/a628f2aefd719745e8a13492375a55cedea77c0cfc917b1ce11bde435c68/ml_dtypes-0.5.3-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:01de48de4537dc3c46e684b969a40ec36594e7eeb7c69e9a093e7239f030a28a", size = 4952704, upload-time = "2025-07-29T18:39:14.829Z" }, - { url = "https://files.pythonhosted.org/packages/f8/2e/5ba92f1f99d1f5f62bffec614a5b8161e55c3961257c902fa26dbe909baa/ml_dtypes-0.5.3-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8b1a6e231b0770f2894910f1dce6d2f31d65884dbf7668f9b08d73623cdca909", size = 4923538, upload-time = "2025-07-29T18:39:16.581Z" }, - { url = "https://files.pythonhosted.org/packages/70/3b/f801c69027866ea6e387224551185fedef62ad8e2e71181ec0d9dda905f7/ml_dtypes-0.5.3-cp39-cp39-win_amd64.whl", hash = "sha256:a4f39b9bf6555fab9bfb536cf5fdd1c1c727e8d22312078702e9ff005354b37f", size = 206567, upload-time = "2025-07-29T18:39:18.047Z" }, ] [[package]] @@ -1337,7 +1142,7 @@ name = "multidict" version = "6.6.4" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions", marker = "python_full_version == '3.10.*'" }, + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/69/7f/0652e6ed47ab288e3756ea9c0df8b14950781184d4bd7883f4d87dd41245/multidict-6.6.4.tar.gz", hash = "sha256:d2d4e4787672911b48350df02ed3fa3fffdc2f2e8ca06dd6afdf34189b76a9dd", size = 101843, upload-time = "2025-08-11T12:08:48.217Z" } wheels = [ @@ -1431,24 +1236,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/50/b0/a6fae46071b645ae98786ab738447de1ef53742eaad949f27e960864bb49/multidict-6.6.4-cp313-cp313t-win32.whl", hash = "sha256:f93b2b2279883d1d0a9e1bd01f312d6fc315c5e4c1f09e112e4736e2f650bc4e", size = 47775, 
upload-time = "2025-08-11T12:08:12.439Z" }, { url = "https://files.pythonhosted.org/packages/b2/0a/2436550b1520091af0600dff547913cb2d66fbac27a8c33bc1b1bccd8d98/multidict-6.6.4-cp313-cp313t-win_amd64.whl", hash = "sha256:6d46a180acdf6e87cc41dc15d8f5c2986e1e8739dc25dbb7dac826731ef381a4", size = 53100, upload-time = "2025-08-11T12:08:13.823Z" }, { url = "https://files.pythonhosted.org/packages/97/ea/43ac51faff934086db9c072a94d327d71b7d8b40cd5dcb47311330929ef0/multidict-6.6.4-cp313-cp313t-win_arm64.whl", hash = "sha256:756989334015e3335d087a27331659820d53ba432befdef6a718398b0a8493ad", size = 45501, upload-time = "2025-08-11T12:08:15.173Z" }, - { url = "https://files.pythonhosted.org/packages/d4/d3/f04c5db316caee9b5b2cbba66270b358c922a959855995bedde87134287c/multidict-6.6.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:af7618b591bae552b40dbb6f93f5518328a949dac626ee75927bba1ecdeea9f4", size = 76977, upload-time = "2025-08-11T12:08:16.667Z" }, - { url = "https://files.pythonhosted.org/packages/70/39/a6200417d883e510728ab3caec02d3b66ff09e1c85e0aab2ba311abfdf06/multidict-6.6.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b6819f83aef06f560cb15482d619d0e623ce9bf155115150a85ab11b8342a665", size = 44878, upload-time = "2025-08-11T12:08:18.157Z" }, - { url = "https://files.pythonhosted.org/packages/6f/7e/815be31ed35571b137d65232816f61513fcd97b2717d6a9d7800b5a0c6e0/multidict-6.6.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4d09384e75788861e046330308e7af54dd306aaf20eb760eb1d0de26b2bea2cb", size = 44546, upload-time = "2025-08-11T12:08:19.694Z" }, - { url = "https://files.pythonhosted.org/packages/e2/f1/21b5bff6a8c3e2aff56956c241941ace6b8820e1abe6b12d3c52868a773d/multidict-6.6.4-cp39-cp39-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:a59c63061f1a07b861c004e53869eb1211ffd1a4acbca330e3322efa6dd02978", size = 223020, upload-time = "2025-08-11T12:08:21.554Z" }, - { url = 
"https://files.pythonhosted.org/packages/15/59/37083f1dd3439979a0ffeb1906818d978d88b4cc7f4600a9f89b1cb6713c/multidict-6.6.4-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:350f6b0fe1ced61e778037fdc7613f4051c8baf64b1ee19371b42a3acdb016a0", size = 240528, upload-time = "2025-08-11T12:08:23.45Z" }, - { url = "https://files.pythonhosted.org/packages/d1/f0/f054d123c87784307a27324c829eb55bcfd2e261eb785fcabbd832c8dc4a/multidict-6.6.4-cp39-cp39-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:0c5cbac6b55ad69cb6aa17ee9343dfbba903118fd530348c330211dc7aa756d1", size = 219540, upload-time = "2025-08-11T12:08:24.965Z" }, - { url = "https://files.pythonhosted.org/packages/e8/26/8f78ce17b7118149c17f238f28fba2a850b660b860f9b024a34d0191030f/multidict-6.6.4-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:630f70c32b8066ddfd920350bc236225814ad94dfa493fe1910ee17fe4365cbb", size = 251182, upload-time = "2025-08-11T12:08:26.511Z" }, - { url = "https://files.pythonhosted.org/packages/00/c3/a21466322d69f6594fe22d9379200f99194d21c12a5bbf8c2a39a46b83b6/multidict-6.6.4-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f8d4916a81697faec6cb724a273bd5457e4c6c43d82b29f9dc02c5542fd21fc9", size = 249371, upload-time = "2025-08-11T12:08:28.075Z" }, - { url = "https://files.pythonhosted.org/packages/c2/8e/2e673124eb05cf8dc82e9265eccde01a36bcbd3193e27799b8377123c976/multidict-6.6.4-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8e42332cf8276bb7645d310cdecca93a16920256a5b01bebf747365f86a1675b", size = 239235, upload-time = "2025-08-11T12:08:29.937Z" }, - { url = "https://files.pythonhosted.org/packages/2b/2d/bdd9f05e7c89e30a4b0e4faf0681a30748f8d1310f68cfdc0e3571e75bd5/multidict-6.6.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = 
"sha256:f3be27440f7644ab9a13a6fc86f09cdd90b347c3c5e30c6d6d860de822d7cb53", size = 237410, upload-time = "2025-08-11T12:08:31.872Z" }, - { url = "https://files.pythonhosted.org/packages/46/4c/3237b83f8ca9a2673bb08fc340c15da005a80f5cc49748b587c8ae83823b/multidict-6.6.4-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:21f216669109e02ef3e2415ede07f4f8987f00de8cdfa0cc0b3440d42534f9f0", size = 232979, upload-time = "2025-08-11T12:08:33.399Z" }, - { url = "https://files.pythonhosted.org/packages/55/a6/a765decff625ae9bc581aed303cd1837955177dafc558859a69f56f56ba8/multidict-6.6.4-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:d9890d68c45d1aeac5178ded1d1cccf3bc8d7accf1f976f79bf63099fb16e4bd", size = 240979, upload-time = "2025-08-11T12:08:35.02Z" }, - { url = "https://files.pythonhosted.org/packages/6b/2d/9c75975cb0c66ea33cae1443bb265b2b3cd689bffcbc68872565f401da23/multidict-6.6.4-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:edfdcae97cdc5d1a89477c436b61f472c4d40971774ac4729c613b4b133163cb", size = 246849, upload-time = "2025-08-11T12:08:37.038Z" }, - { url = "https://files.pythonhosted.org/packages/3e/71/d21ac0843c1d8751fb5dcf8a1f436625d39d4577bc27829799d09b419af7/multidict-6.6.4-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:0b2e886624be5773e69cf32bcb8534aecdeb38943520b240fed3d5596a430f2f", size = 241798, upload-time = "2025-08-11T12:08:38.669Z" }, - { url = "https://files.pythonhosted.org/packages/94/3d/1d8911e53092837bd11b1c99d71de3e2a9a26f8911f864554677663242aa/multidict-6.6.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:be5bf4b3224948032a845d12ab0f69f208293742df96dc14c4ff9b09e508fc17", size = 235315, upload-time = "2025-08-11T12:08:40.266Z" }, - { url = "https://files.pythonhosted.org/packages/86/c5/4b758df96376f73e936b1942c6c2dfc17e37ed9d5ff3b01a811496966ca0/multidict-6.6.4-cp39-cp39-win32.whl", hash = "sha256:10a68a9191f284fe9d501fef4efe93226e74df92ce7a24e301371293bd4918ae", size = 41434, upload-time = "2025-08-11T12:08:41.965Z" }, - { url = 
"https://files.pythonhosted.org/packages/58/16/f1dfa2a0f25f2717a5e9e5fe8fd30613f7fe95e3530cec8d11f5de0b709c/multidict-6.6.4-cp39-cp39-win_amd64.whl", hash = "sha256:ee25f82f53262f9ac93bd7e58e47ea1bdcc3393cef815847e397cba17e284210", size = 46186, upload-time = "2025-08-11T12:08:43.367Z" }, - { url = "https://files.pythonhosted.org/packages/88/7d/a0568bac65438c494cb6950b29f394d875a796a237536ac724879cf710c9/multidict-6.6.4-cp39-cp39-win_arm64.whl", hash = "sha256:f9867e55590e0855bcec60d4f9a092b69476db64573c9fe17e92b0c50614c16a", size = 43115, upload-time = "2025-08-11T12:08:45.126Z" }, { url = "https://files.pythonhosted.org/packages/fd/69/b547032297c7e63ba2af494edba695d781af8a0c6e89e4d06cf848b21d80/multidict-6.6.4-py3-none-any.whl", hash = "sha256:27d8f8e125c07cb954e54d75d04905a9bba8a439c1d84aca94949d4d03d8601c", size = 12313, upload-time = "2025-08-11T12:08:46.891Z" }, ] @@ -1457,14 +1244,12 @@ name = "multiprocess" version = "0.70.16" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "dill", marker = "python_full_version >= '3.10'" }, + { name = "dill" }, ] sdist = { url = "https://files.pythonhosted.org/packages/b5/ae/04f39c5d0d0def03247c2893d6f2b83c136bf3320a2154d7b8858f2ba72d/multiprocess-0.70.16.tar.gz", hash = "sha256:161af703d4652a0e1410be6abccecde4a7ddffd19341be0a7011b94aeb171ac1", size = 1772603, upload-time = "2024-01-28T18:52:34.85Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/ef/76/6e712a2623d146d314f17598df5de7224c85c0060ef63fd95cc15a25b3fa/multiprocess-0.70.16-pp310-pypy310_pp73-macosx_10_13_x86_64.whl", hash = "sha256:476887be10e2f59ff183c006af746cb6f1fd0eadcfd4ef49e605cbe2659920ee", size = 134980, upload-time = "2024-01-28T18:52:15.731Z" }, { url = "https://files.pythonhosted.org/packages/0f/ab/1e6e8009e380e22254ff539ebe117861e5bdb3bff1fc977920972237c6c7/multiprocess-0.70.16-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:d951bed82c8f73929ac82c61f01a7b5ce8f3e5ef40f5b52553b4f547ce2b08ec", 
size = 134982, upload-time = "2024-01-28T18:52:17.783Z" }, - { url = "https://files.pythonhosted.org/packages/d8/94/8638a89f93c80df329116e6781a060506c7e91e1f4370dc831e9d17a041d/multiprocess-0.70.16-pp39-pypy39_pp73-macosx_10_13_x86_64.whl", hash = "sha256:0dfd078c306e08d46d7a8d06fb120313d87aa43af60d66da43ffff40b44d2f41", size = 133497, upload-time = "2024-01-28T18:52:22.644Z" }, - { url = "https://files.pythonhosted.org/packages/89/21/222066f6bb8d8af287923ae3bd26cf4699a9ce020228ac273caca1de8250/multiprocess-0.70.16-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e7b9d0f307cd9bd50851afaac0dba2cb6c44449efff697df7c7645f7d3f2be3a", size = 133498, upload-time = "2024-01-28T18:52:24.576Z" }, { url = "https://files.pythonhosted.org/packages/bc/f7/7ec7fddc92e50714ea3745631f79bd9c96424cb2702632521028e57d3a36/multiprocess-0.70.16-py310-none-any.whl", hash = "sha256:c4a9944c67bd49f823687463660a2d6daae94c289adff97e0f9d696ba6371d02", size = 134824, upload-time = "2024-01-28T18:52:26.062Z" }, { url = "https://files.pythonhosted.org/packages/50/15/b56e50e8debaf439f44befec5b2af11db85f6e0f344c3113ae0be0593a91/multiprocess-0.70.16-py311-none-any.whl", hash = "sha256:af4cabb0dac72abfb1e794fa7855c325fd2b55a10a44628a3c1ad3311c04127a", size = 143519, upload-time = "2024-01-28T18:52:28.115Z" }, { url = "https://files.pythonhosted.org/packages/0a/7d/a988f258104dcd2ccf1ed40fdc97e26c4ac351eeaf81d76e266c52d84e2f/multiprocess-0.70.16-py312-none-any.whl", hash = "sha256:fc0544c531920dde3b00c29863377f87e1632601092ea2daca74e4beb40faa2e", size = 146741, upload-time = "2024-01-28T18:52:29.395Z" }, @@ -1481,24 +1266,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b2/bc/465daf1de06409cdd4532082806770ee0d8d7df434da79c76564d0f69741/namex-0.1.0-py3-none-any.whl", hash = "sha256:e2012a474502f1e2251267062aae3114611f07df4224b6e06334c57b0f2ce87c", size = 5905, upload-time = "2025-05-26T23:17:37.695Z" }, ] -[[package]] -name = "networkx" -version = "3.2.1" -source = { registry 
= "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -sdist = { url = "https://files.pythonhosted.org/packages/c4/80/a84676339aaae2f1cfdf9f418701dd634aef9cc76f708ef55c36ff39c3ca/networkx-3.2.1.tar.gz", hash = "sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6", size = 2073928, upload-time = "2023-10-28T08:41:39.364Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d5/f0/8fbc882ca80cf077f1b246c0e3c3465f7f415439bdea6b899f6b19f61f70/networkx-3.2.1-py3-none-any.whl", hash = "sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2", size = 1647772, upload-time = "2023-10-28T08:41:36.945Z" }, -] - [[package]] name = "networkx" version = "3.4.2" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.10.*'", + "python_full_version < '3.11'", ] sdist = { url = "https://files.pythonhosted.org/packages/fd/1d/06475e1cd5264c0b870ea2cc6fdb3e37177c1e565c43f56ff17a10e3937f/networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1", size = 2151368, upload-time = "2024-10-21T12:39:38.695Z" } wheels = [ @@ -1529,67 +1302,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314, upload-time = "2024-06-04T18:44:08.352Z" }, ] -[[package]] -name = "numpy" -version = "2.0.2" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -sdist = { url = "https://files.pythonhosted.org/packages/a9/75/10dd1f8116a8b796cb2c737b674e02d02e80454bda953fa7e65d8c12b016/numpy-2.0.2.tar.gz", hash = "sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78", size = 18902015, upload-time = "2024-08-26T20:19:40.945Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/21/91/3495b3237510f79f5d81f2508f9f13fea78ebfdf07538fc7444badda173d/numpy-2.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:51129a29dbe56f9ca83438b706e2e69a39892b5eda6cedcb6b0c9fdc9b0d3ece", size = 21165245, upload-time = "2024-08-26T20:04:14.625Z" }, - { url = "https://files.pythonhosted.org/packages/05/33/26178c7d437a87082d11019292dce6d3fe6f0e9026b7b2309cbf3e489b1d/numpy-2.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f15975dfec0cf2239224d80e32c3170b1d168335eaedee69da84fbe9f1f9cd04", size = 13738540, upload-time = "2024-08-26T20:04:36.784Z" }, - { url = "https://files.pythonhosted.org/packages/ec/31/cc46e13bf07644efc7a4bf68df2df5fb2a1a88d0cd0da9ddc84dc0033e51/numpy-2.0.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:8c5713284ce4e282544c68d1c3b2c7161d38c256d2eefc93c1d683cf47683e66", size = 5300623, upload-time = "2024-08-26T20:04:46.491Z" }, - { url = "https://files.pythonhosted.org/packages/6e/16/7bfcebf27bb4f9d7ec67332ffebee4d1bf085c84246552d52dbb548600e7/numpy-2.0.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:becfae3ddd30736fe1889a37f1f580e245ba79a5855bff5f2a29cb3ccc22dd7b", size = 6901774, upload-time = "2024-08-26T20:04:58.173Z" }, - { url = "https://files.pythonhosted.org/packages/f9/a3/561c531c0e8bf082c5bef509d00d56f82e0ea7e1e3e3a7fc8fa78742a6e5/numpy-2.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2da5960c3cf0df7eafefd806d4e612c5e19358de82cb3c343631188991566ccd", size = 13907081, upload-time = "2024-08-26T20:05:19.098Z" }, - { url = "https://files.pythonhosted.org/packages/fa/66/f7177ab331876200ac7563a580140643d1179c8b4b6a6b0fc9838de2a9b8/numpy-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:496f71341824ed9f3d2fd36cf3ac57ae2e0165c143b55c3a035ee219413f3318", size = 19523451, upload-time = "2024-08-26T20:05:47.479Z" }, - { url = 
"https://files.pythonhosted.org/packages/25/7f/0b209498009ad6453e4efc2c65bcdf0ae08a182b2b7877d7ab38a92dc542/numpy-2.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a61ec659f68ae254e4d237816e33171497e978140353c0c2038d46e63282d0c8", size = 19927572, upload-time = "2024-08-26T20:06:17.137Z" }, - { url = "https://files.pythonhosted.org/packages/3e/df/2619393b1e1b565cd2d4c4403bdd979621e2c4dea1f8532754b2598ed63b/numpy-2.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d731a1c6116ba289c1e9ee714b08a8ff882944d4ad631fd411106a30f083c326", size = 14400722, upload-time = "2024-08-26T20:06:39.16Z" }, - { url = "https://files.pythonhosted.org/packages/22/ad/77e921b9f256d5da36424ffb711ae79ca3f451ff8489eeca544d0701d74a/numpy-2.0.2-cp310-cp310-win32.whl", hash = "sha256:984d96121c9f9616cd33fbd0618b7f08e0cfc9600a7ee1d6fd9b239186d19d97", size = 6472170, upload-time = "2024-08-26T20:06:50.361Z" }, - { url = "https://files.pythonhosted.org/packages/10/05/3442317535028bc29cf0c0dd4c191a4481e8376e9f0db6bcf29703cadae6/numpy-2.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:c7b0be4ef08607dd04da4092faee0b86607f111d5ae68036f16cc787e250a131", size = 15905558, upload-time = "2024-08-26T20:07:13.881Z" }, - { url = "https://files.pythonhosted.org/packages/8b/cf/034500fb83041aa0286e0fb16e7c76e5c8b67c0711bb6e9e9737a717d5fe/numpy-2.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:49ca4decb342d66018b01932139c0961a8f9ddc7589611158cb3c27cbcf76448", size = 21169137, upload-time = "2024-08-26T20:07:45.345Z" }, - { url = "https://files.pythonhosted.org/packages/4a/d9/32de45561811a4b87fbdee23b5797394e3d1504b4a7cf40c10199848893e/numpy-2.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:11a76c372d1d37437857280aa142086476136a8c0f373b2e648ab2c8f18fb195", size = 13703552, upload-time = "2024-08-26T20:08:06.666Z" }, - { url = "https://files.pythonhosted.org/packages/c1/ca/2f384720020c7b244d22508cb7ab23d95f179fcfff33c31a6eeba8d6c512/numpy-2.0.2-cp311-cp311-macosx_14_0_arm64.whl", 
hash = "sha256:807ec44583fd708a21d4a11d94aedf2f4f3c3719035c76a2bbe1fe8e217bdc57", size = 5298957, upload-time = "2024-08-26T20:08:15.83Z" }, - { url = "https://files.pythonhosted.org/packages/0e/78/a3e4f9fb6aa4e6fdca0c5428e8ba039408514388cf62d89651aade838269/numpy-2.0.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:8cafab480740e22f8d833acefed5cc87ce276f4ece12fdaa2e8903db2f82897a", size = 6905573, upload-time = "2024-08-26T20:08:27.185Z" }, - { url = "https://files.pythonhosted.org/packages/a0/72/cfc3a1beb2caf4efc9d0b38a15fe34025230da27e1c08cc2eb9bfb1c7231/numpy-2.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a15f476a45e6e5a3a79d8a14e62161d27ad897381fecfa4a09ed5322f2085669", size = 13914330, upload-time = "2024-08-26T20:08:48.058Z" }, - { url = "https://files.pythonhosted.org/packages/ba/a8/c17acf65a931ce551fee11b72e8de63bf7e8a6f0e21add4c937c83563538/numpy-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13e689d772146140a252c3a28501da66dfecd77490b498b168b501835041f951", size = 19534895, upload-time = "2024-08-26T20:09:16.536Z" }, - { url = "https://files.pythonhosted.org/packages/ba/86/8767f3d54f6ae0165749f84648da9dcc8cd78ab65d415494962c86fac80f/numpy-2.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9ea91dfb7c3d1c56a0e55657c0afb38cf1eeae4544c208dc465c3c9f3a7c09f9", size = 19937253, upload-time = "2024-08-26T20:09:46.263Z" }, - { url = "https://files.pythonhosted.org/packages/df/87/f76450e6e1c14e5bb1eae6836478b1028e096fd02e85c1c37674606ab752/numpy-2.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c1c9307701fec8f3f7a1e6711f9089c06e6284b3afbbcd259f7791282d660a15", size = 14414074, upload-time = "2024-08-26T20:10:08.483Z" }, - { url = "https://files.pythonhosted.org/packages/5c/ca/0f0f328e1e59f73754f06e1adfb909de43726d4f24c6a3f8805f34f2b0fa/numpy-2.0.2-cp311-cp311-win32.whl", hash = "sha256:a392a68bd329eafac5817e5aefeb39038c48b671afd242710b451e76090e81f4", size = 6470640, 
upload-time = "2024-08-26T20:10:19.732Z" }, - { url = "https://files.pythonhosted.org/packages/eb/57/3a3f14d3a759dcf9bf6e9eda905794726b758819df4663f217d658a58695/numpy-2.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:286cd40ce2b7d652a6f22efdfc6d1edf879440e53e76a75955bc0c826c7e64dc", size = 15910230, upload-time = "2024-08-26T20:10:43.413Z" }, - { url = "https://files.pythonhosted.org/packages/45/40/2e117be60ec50d98fa08c2f8c48e09b3edea93cfcabd5a9ff6925d54b1c2/numpy-2.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:df55d490dea7934f330006d0f81e8551ba6010a5bf035a249ef61a94f21c500b", size = 20895803, upload-time = "2024-08-26T20:11:13.916Z" }, - { url = "https://files.pythonhosted.org/packages/46/92/1b8b8dee833f53cef3e0a3f69b2374467789e0bb7399689582314df02651/numpy-2.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8df823f570d9adf0978347d1f926b2a867d5608f434a7cff7f7908c6570dcf5e", size = 13471835, upload-time = "2024-08-26T20:11:34.779Z" }, - { url = "https://files.pythonhosted.org/packages/7f/19/e2793bde475f1edaea6945be141aef6c8b4c669b90c90a300a8954d08f0a/numpy-2.0.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:9a92ae5c14811e390f3767053ff54eaee3bf84576d99a2456391401323f4ec2c", size = 5038499, upload-time = "2024-08-26T20:11:43.902Z" }, - { url = "https://files.pythonhosted.org/packages/e3/ff/ddf6dac2ff0dd50a7327bcdba45cb0264d0e96bb44d33324853f781a8f3c/numpy-2.0.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:a842d573724391493a97a62ebbb8e731f8a5dcc5d285dfc99141ca15a3302d0c", size = 6633497, upload-time = "2024-08-26T20:11:55.09Z" }, - { url = "https://files.pythonhosted.org/packages/72/21/67f36eac8e2d2cd652a2e69595a54128297cdcb1ff3931cfc87838874bd4/numpy-2.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c05e238064fc0610c840d1cf6a13bf63d7e391717d247f1bf0318172e759e692", size = 13621158, upload-time = "2024-08-26T20:12:14.95Z" }, - { url = 
"https://files.pythonhosted.org/packages/39/68/e9f1126d757653496dbc096cb429014347a36b228f5a991dae2c6b6cfd40/numpy-2.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0123ffdaa88fa4ab64835dcbde75dcdf89c453c922f18dced6e27c90d1d0ec5a", size = 19236173, upload-time = "2024-08-26T20:12:44.049Z" }, - { url = "https://files.pythonhosted.org/packages/d1/e9/1f5333281e4ebf483ba1c888b1d61ba7e78d7e910fdd8e6499667041cc35/numpy-2.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:96a55f64139912d61de9137f11bf39a55ec8faec288c75a54f93dfd39f7eb40c", size = 19634174, upload-time = "2024-08-26T20:13:13.634Z" }, - { url = "https://files.pythonhosted.org/packages/71/af/a469674070c8d8408384e3012e064299f7a2de540738a8e414dcfd639996/numpy-2.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ec9852fb39354b5a45a80bdab5ac02dd02b15f44b3804e9f00c556bf24b4bded", size = 14099701, upload-time = "2024-08-26T20:13:34.851Z" }, - { url = "https://files.pythonhosted.org/packages/d0/3d/08ea9f239d0e0e939b6ca52ad403c84a2bce1bde301a8eb4888c1c1543f1/numpy-2.0.2-cp312-cp312-win32.whl", hash = "sha256:671bec6496f83202ed2d3c8fdc486a8fc86942f2e69ff0e986140339a63bcbe5", size = 6174313, upload-time = "2024-08-26T20:13:45.653Z" }, - { url = "https://files.pythonhosted.org/packages/b2/b5/4ac39baebf1fdb2e72585c8352c56d063b6126be9fc95bd2bb5ef5770c20/numpy-2.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:cfd41e13fdc257aa5778496b8caa5e856dc4896d4ccf01841daee1d96465467a", size = 15606179, upload-time = "2024-08-26T20:14:08.786Z" }, - { url = "https://files.pythonhosted.org/packages/43/c1/41c8f6df3162b0c6ffd4437d729115704bd43363de0090c7f913cfbc2d89/numpy-2.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9059e10581ce4093f735ed23f3b9d283b9d517ff46009ddd485f1747eb22653c", size = 21169942, upload-time = "2024-08-26T20:14:40.108Z" }, - { url = 
"https://files.pythonhosted.org/packages/39/bc/fd298f308dcd232b56a4031fd6ddf11c43f9917fbc937e53762f7b5a3bb1/numpy-2.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:423e89b23490805d2a5a96fe40ec507407b8ee786d66f7328be214f9679df6dd", size = 13711512, upload-time = "2024-08-26T20:15:00.985Z" }, - { url = "https://files.pythonhosted.org/packages/96/ff/06d1aa3eeb1c614eda245c1ba4fb88c483bee6520d361641331872ac4b82/numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:2b2955fa6f11907cf7a70dab0d0755159bca87755e831e47932367fc8f2f2d0b", size = 5306976, upload-time = "2024-08-26T20:15:10.876Z" }, - { url = "https://files.pythonhosted.org/packages/2d/98/121996dcfb10a6087a05e54453e28e58694a7db62c5a5a29cee14c6e047b/numpy-2.0.2-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:97032a27bd9d8988b9a97a8c4d2c9f2c15a81f61e2f21404d7e8ef00cb5be729", size = 6906494, upload-time = "2024-08-26T20:15:22.055Z" }, - { url = "https://files.pythonhosted.org/packages/15/31/9dffc70da6b9bbf7968f6551967fc21156207366272c2a40b4ed6008dc9b/numpy-2.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e795a8be3ddbac43274f18588329c72939870a16cae810c2b73461c40718ab1", size = 13912596, upload-time = "2024-08-26T20:15:42.452Z" }, - { url = "https://files.pythonhosted.org/packages/b9/14/78635daab4b07c0930c919d451b8bf8c164774e6a3413aed04a6d95758ce/numpy-2.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26b258c385842546006213344c50655ff1555a9338e2e5e02a0756dc3e803dd", size = 19526099, upload-time = "2024-08-26T20:16:11.048Z" }, - { url = "https://files.pythonhosted.org/packages/26/4c/0eeca4614003077f68bfe7aac8b7496f04221865b3a5e7cb230c9d055afd/numpy-2.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5fec9451a7789926bcf7c2b8d187292c9f93ea30284802a0ab3f5be8ab36865d", size = 19932823, upload-time = "2024-08-26T20:16:40.171Z" }, - { url = 
"https://files.pythonhosted.org/packages/f1/46/ea25b98b13dccaebddf1a803f8c748680d972e00507cd9bc6dcdb5aa2ac1/numpy-2.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:9189427407d88ff25ecf8f12469d4d39d35bee1db5d39fc5c168c6f088a6956d", size = 14404424, upload-time = "2024-08-26T20:17:02.604Z" }, - { url = "https://files.pythonhosted.org/packages/c8/a6/177dd88d95ecf07e722d21008b1b40e681a929eb9e329684d449c36586b2/numpy-2.0.2-cp39-cp39-win32.whl", hash = "sha256:905d16e0c60200656500c95b6b8dca5d109e23cb24abc701d41c02d74c6b3afa", size = 6476809, upload-time = "2024-08-26T20:17:13.553Z" }, - { url = "https://files.pythonhosted.org/packages/ea/2b/7fc9f4e7ae5b507c1a3a21f0f15ed03e794c1242ea8a242ac158beb56034/numpy-2.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:a3f4ab0caa7f053f6797fcd4e1e25caee367db3112ef2b6ef82d749530768c73", size = 15911314, upload-time = "2024-08-26T20:17:36.72Z" }, - { url = "https://files.pythonhosted.org/packages/8f/3b/df5a870ac6a3be3a86856ce195ef42eec7ae50d2a202be1f5a4b3b340e14/numpy-2.0.2-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7f0a0c6f12e07fa94133c8a67404322845220c06a9e80e85999afe727f7438b8", size = 21025288, upload-time = "2024-08-26T20:18:07.732Z" }, - { url = "https://files.pythonhosted.org/packages/2c/97/51af92f18d6f6f2d9ad8b482a99fb74e142d71372da5d834b3a2747a446e/numpy-2.0.2-pp39-pypy39_pp73-macosx_14_0_x86_64.whl", hash = "sha256:312950fdd060354350ed123c0e25a71327d3711584beaef30cdaa93320c392d4", size = 6762793, upload-time = "2024-08-26T20:18:19.125Z" }, - { url = "https://files.pythonhosted.org/packages/12/46/de1fbd0c1b5ccaa7f9a005b66761533e2f6a3e560096682683a223631fe9/numpy-2.0.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26df23238872200f63518dd2aa984cfca675d82469535dc7162dc2ee52d9dd5c", size = 19334885, upload-time = "2024-08-26T20:18:47.237Z" }, - { url = 
"https://files.pythonhosted.org/packages/cc/dc/d330a6faefd92b446ec0f0dfea4c3207bb1fef3c4771d19cf4543efd2c78/numpy-2.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a46288ec55ebbd58947d31d72be2c63cbf839f0a63b49cb755022310792a3385", size = 15828784, upload-time = "2024-08-26T20:19:11.19Z" }, -] - [[package]] name = "numpy" version = "2.2.6" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.10.*'", + "python_full_version < '3.11'", ] sdist = { url = "https://files.pythonhosted.org/packages/76/21/7d2a95e4bba9dc13d043ee156a356c0a8f0c6309dff6b21b4d71a073b8a8/numpy-2.2.6.tar.gz", hash = "sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd", size = 20276440, upload-time = "2025-05-17T22:38:04.611Z" } wheels = [ @@ -1876,7 +1594,7 @@ name = "optree" version = "0.17.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions", marker = "python_full_version >= '3.10'" }, + { name = "typing-extensions" }, ] sdist = { url = "https://files.pythonhosted.org/packages/56/c7/0853e0c59b135dff770615d2713b547b6b3b5cde7c10995b4a5825244612/optree-0.17.0.tar.gz", hash = "sha256:5335a5ec44479920620d72324c66563bd705ab2a698605dd4b6ee67dbcad7ecd", size = 163111, upload-time = "2025-07-25T11:26:11.586Z" } wheels = [ @@ -1917,11 +1635,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5b/d3/8819a2d5105a240d6793d11a61d597db91756ce84da5cee08808c6b8f61f/optree-0.17.0-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:875c017890a4b5d566af5593cab67fe3c4845544942af57e6bb9dea17e060297", size = 439080, upload-time = "2025-07-25T11:25:42.605Z" }, { url = "https://files.pythonhosted.org/packages/c6/ef/9dbd34dfd1ad89feb239ca9925897a14ac94f190379a3bd991afdfd94186/optree-0.17.0-cp314-cp314t-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ffa5686191139f763e13445a169765c83517164bc28e60dbedb19bed2b2655f1", size = 439422, upload-time = 
"2025-07-25T11:25:43.672Z" }, { url = "https://files.pythonhosted.org/packages/86/ca/a7a7549af2951925a692df508902ed2a6a94a51bc846806d2281b1029ef9/optree-0.17.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:575cf48cc2190acb565bd2b26b6f9b15c4e3b60183e86031215badc9d5441345", size = 426579, upload-time = "2025-07-25T11:25:44.765Z" }, - { url = "https://files.pythonhosted.org/packages/1d/29/3bb53de2de3b36a51e46b6d9ada7ee1a3a312ac461cd54292a023adc807c/optree-0.17.0-cp39-cp39-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:537498cf7bf7a4fe71f7ffd815e72b8672aea0fac82e1513f6b6e35e8569f5aa", size = 350302, upload-time = "2025-07-25T11:25:52.016Z" }, - { url = "https://files.pythonhosted.org/packages/2b/3b/d17a31447ed7ef6f10bd0caf40742b016fcdeaa3abb7568307b04a0f50cf/optree-0.17.0-cp39-cp39-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:3b3bb2326b550ddb048e3454fad40183b7fed74dda4351b016d20362809180af", size = 405358, upload-time = "2025-07-25T11:25:53.085Z" }, - { url = "https://files.pythonhosted.org/packages/db/f3/b9f0a8c98fd0c7f53fa9d9a46d75bb1182aeecd7ecde6f353d3e69ec9618/optree-0.17.0-cp39-cp39-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c0d3d702044e5acbec2cf8349789f6b096057bd00dc8e1e1c97b990347279fda", size = 402694, upload-time = "2025-07-25T11:25:54.537Z" }, - { url = "https://files.pythonhosted.org/packages/cb/dd/0d9d7426fd6b5d90ad40e4d93717a955d4257d06574dfe7a1da0d24cb06c/optree-0.17.0-cp39-cp39-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a9155e82717be1dda1f3c1244e9cb5b3733d5dd3ba47702730c7816be083a5cb", size = 398857, upload-time = "2025-07-25T11:25:55.921Z" }, - { url = "https://files.pythonhosted.org/packages/d8/57/dacec3f8c70f4685bb07fce19cf3361037fde2b596f6f7228e1a4b39677b/optree-0.17.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a8e825501f55360e8381718623b094579dedc485e57010e01593d72a43b43e68", size = 387849, upload-time = 
"2025-07-25T11:25:57.046Z" }, { url = "https://files.pythonhosted.org/packages/ed/d7/3036d15c028c447b1bd65dcf8f66cfd775bfa4e52daa74b82fb1d3c88faf/optree-0.17.0-pp310-pypy310_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:adde1427e0982cfc5f56939c26b4ebbd833091a176734c79fb95c78bdf833dff", size = 350952, upload-time = "2025-07-25T11:26:02.692Z" }, { url = "https://files.pythonhosted.org/packages/71/45/e710024ef77324e745de48efd64f6270d8c209f14107a48ffef4049ac57a/optree-0.17.0-pp310-pypy310_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a80b7e5de5dd09b9c8b62d501e29a3850b047565c336c9d004b07ee1c01f4ae1", size = 389568, upload-time = "2025-07-25T11:26:04.094Z" }, { url = "https://files.pythonhosted.org/packages/69/c4/94a187ed3ca71194b9da6a276790e1703c7544c8f695ac915214ae8ce934/optree-0.17.0-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f87f6f39015fc82d7adeee19900d246b89911319726e93cb2dbd4d1a809899bd", size = 363728, upload-time = "2025-07-25T11:26:07.959Z" }, @@ -1942,8 +1655,7 @@ name = "pandas" version = "2.3.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "python-dateutil" }, { name = "pytz" }, @@ -1998,13 +1710,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a4/1e/1bac1a839d12e6a82ec6cb40cda2edde64a2013a66963293696bbf31fbbb/pandas-2.3.3-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:2e3ebdb170b5ef78f19bfb71b0dc5dc58775032361fa188e814959b74d726dd5", size = 12121582, upload-time = "2025-09-29T23:30:43.391Z" }, { url = "https://files.pythonhosted.org/packages/44/91/483de934193e12a3b1d6ae7c8645d083ff88dec75f46e827562f1e4b4da6/pandas-2.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d051c0e065b94b7a3cea50eb1ec32e912cd96dba41647eb24104b6c6c14c5788", size = 12699963, upload-time = "2025-09-29T23:31:10.009Z" }, { url = "https://files.pythonhosted.org/packages/70/44/5191d2e4026f86a2a109053e194d3ba7a31a2d10a9c2348368c63ed4e85a/pandas-2.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3869faf4bd07b3b66a9f462417d0ca3a9df29a9f6abd5d0d0dbab15dac7abe87", size = 13202175, upload-time = "2025-09-29T23:31:59.173Z" }, - { url = "https://files.pythonhosted.org/packages/56/b4/52eeb530a99e2a4c55ffcd352772b599ed4473a0f892d127f4147cf0f88e/pandas-2.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c503ba5216814e295f40711470446bc3fd00f0faea8a086cbc688808e26f92a2", size = 11567720, upload-time = "2025-09-29T23:33:06.209Z" }, - { url = "https://files.pythonhosted.org/packages/48/4a/2d8b67632a021bced649ba940455ed441ca854e57d6e7658a6024587b083/pandas-2.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a637c5cdfa04b6d6e2ecedcb81fc52ffb0fd78ce2ebccc9ea964df9f658de8c8", size = 10810302, upload-time = "2025-09-29T23:33:35.846Z" }, - { url = "https://files.pythonhosted.org/packages/13/e6/d2465010ee0569a245c975dc6967b801887068bc893e908239b1f4b6c1ac/pandas-2.3.3-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:854d00d556406bffe66a4c0802f334c9ad5a96b4f1f868adf036a21b11ef13ff", size = 12154874, upload-time = "2025-09-29T23:33:49.939Z" }, - { url = "https://files.pythonhosted.org/packages/1f/18/aae8c0aa69a386a3255940e9317f793808ea79d0a525a97a903366bb2569/pandas-2.3.3-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bf1f8a81d04ca90e32a0aceb819d34dbd378a98bf923b6398b9a3ec0bf44de29", size = 
12790141, upload-time = "2025-09-29T23:34:05.655Z" }, - { url = "https://files.pythonhosted.org/packages/f7/26/617f98de789de00c2a444fbe6301bb19e66556ac78cff933d2c98f62f2b4/pandas-2.3.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:23ebd657a4d38268c7dfbdf089fbc31ea709d82e4923c5ffd4fbd5747133ce73", size = 13208697, upload-time = "2025-09-29T23:34:21.835Z" }, - { url = "https://files.pythonhosted.org/packages/b9/fb/25709afa4552042bd0e15717c75e9b4a2294c3dc4f7e6ea50f03c5136600/pandas-2.3.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:5554c929ccc317d41a5e3d1234f3be588248e61f08a74dd17c9eabb535777dc9", size = 13879233, upload-time = "2025-09-29T23:34:35.079Z" }, - { url = "https://files.pythonhosted.org/packages/98/af/7be05277859a7bc399da8ba68b88c96b27b48740b6cf49688899c6eb4176/pandas-2.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:d3e28b3e83862ccf4d85ff19cf8c20b2ae7e503881711ff2d534dc8f761131aa", size = 11359119, upload-time = "2025-09-29T23:34:46.339Z" }, ] [[package]] @@ -2093,17 +1798,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f0/77/bc6f92a3e8e6e46c0ca78abfffec0037845800ea38c73483760362804c41/pillow-11.3.0-cp314-cp314t-win32.whl", hash = "sha256:118ca10c0d60b06d006be10a501fd6bbdfef559251ed31b794668ed569c87e12", size = 6377370, upload-time = "2025-07-01T09:15:46.673Z" }, { url = "https://files.pythonhosted.org/packages/4a/82/3a721f7d69dca802befb8af08b7c79ebcab461007ce1c18bd91a5d5896f9/pillow-11.3.0-cp314-cp314t-win_amd64.whl", hash = "sha256:8924748b688aa210d79883357d102cd64690e56b923a186f35a82cbc10f997db", size = 7121500, upload-time = "2025-07-01T09:15:48.512Z" }, { url = "https://files.pythonhosted.org/packages/89/c7/5572fa4a3f45740eaab6ae86fcdf7195b55beac1371ac8c619d880cfe948/pillow-11.3.0-cp314-cp314t-win_arm64.whl", hash = "sha256:79ea0d14d3ebad43ec77ad5272e6ff9bba5b679ef73375ea760261207fa8e0aa", size = 2512835, upload-time = "2025-07-01T09:15:50.399Z" }, - { url = 
"https://files.pythonhosted.org/packages/9e/8e/9c089f01677d1264ab8648352dcb7773f37da6ad002542760c80107da816/pillow-11.3.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:48d254f8a4c776de343051023eb61ffe818299eeac478da55227d96e241de53f", size = 5316478, upload-time = "2025-07-01T09:15:52.209Z" }, - { url = "https://files.pythonhosted.org/packages/b5/a9/5749930caf674695867eb56a581e78eb5f524b7583ff10b01b6e5048acb3/pillow-11.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7aee118e30a4cf54fdd873bd3a29de51e29105ab11f9aad8c32123f58c8f8081", size = 4686522, upload-time = "2025-07-01T09:15:54.162Z" }, - { url = "https://files.pythonhosted.org/packages/43/46/0b85b763eb292b691030795f9f6bb6fcaf8948c39413c81696a01c3577f7/pillow-11.3.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:23cff760a9049c502721bdb743a7cb3e03365fafcdfc2ef9784610714166e5a4", size = 5853376, upload-time = "2025-07-03T13:11:01.066Z" }, - { url = "https://files.pythonhosted.org/packages/5e/c6/1a230ec0067243cbd60bc2dad5dc3ab46a8a41e21c15f5c9b52b26873069/pillow-11.3.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6359a3bc43f57d5b375d1ad54a0074318a0844d11b76abccf478c37c986d3cfc", size = 7626020, upload-time = "2025-07-03T13:11:06.479Z" }, - { url = "https://files.pythonhosted.org/packages/63/dd/f296c27ffba447bfad76c6a0c44c1ea97a90cb9472b9304c94a732e8dbfb/pillow-11.3.0-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:092c80c76635f5ecb10f3f83d76716165c96f5229addbd1ec2bdbbda7d496e06", size = 5956732, upload-time = "2025-07-01T09:15:56.111Z" }, - { url = "https://files.pythonhosted.org/packages/a5/a0/98a3630f0b57f77bae67716562513d3032ae70414fcaf02750279c389a9e/pillow-11.3.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cadc9e0ea0a2431124cde7e1697106471fc4c1da01530e679b2391c37d3fbb3a", size = 6624404, upload-time = "2025-07-01T09:15:58.245Z" }, - { url = 
"https://files.pythonhosted.org/packages/de/e6/83dfba5646a290edd9a21964da07674409e410579c341fc5b8f7abd81620/pillow-11.3.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:6a418691000f2a418c9135a7cf0d797c1bb7d9a485e61fe8e7722845b95ef978", size = 6067760, upload-time = "2025-07-01T09:16:00.003Z" }, - { url = "https://files.pythonhosted.org/packages/bc/41/15ab268fe6ee9a2bc7391e2bbb20a98d3974304ab1a406a992dcb297a370/pillow-11.3.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:97afb3a00b65cc0804d1c7abddbf090a81eaac02768af58cbdcaaa0a931e0b6d", size = 6700534, upload-time = "2025-07-01T09:16:02.29Z" }, - { url = "https://files.pythonhosted.org/packages/64/79/6d4f638b288300bed727ff29f2a3cb63db054b33518a95f27724915e3fbc/pillow-11.3.0-cp39-cp39-win32.whl", hash = "sha256:ea944117a7974ae78059fcc1800e5d3295172bb97035c0c1d9345fca1419da71", size = 6277091, upload-time = "2025-07-01T09:16:04.4Z" }, - { url = "https://files.pythonhosted.org/packages/46/05/4106422f45a05716fd34ed21763f8ec182e8ea00af6e9cb05b93a247361a/pillow-11.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:e5c5858ad8ec655450a7c7df532e9842cf8df7cc349df7225c60d5d348c8aada", size = 6986091, upload-time = "2025-07-01T09:16:06.342Z" }, - { url = "https://files.pythonhosted.org/packages/63/c6/287fd55c2c12761d0591549d48885187579b7c257bef0c6660755b0b59ae/pillow-11.3.0-cp39-cp39-win_arm64.whl", hash = "sha256:6abdbfd3aea42be05702a8dd98832329c167ee84400a1d1f61ab11437f1717eb", size = 2422632, upload-time = "2025-07-01T09:16:08.142Z" }, { url = "https://files.pythonhosted.org/packages/6f/8b/209bd6b62ce8367f47e68a218bffac88888fdf2c9fcf1ecadc6c3ec1ebc7/pillow-11.3.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:3cee80663f29e3843b68199b9d6f4f54bd1d4a6b59bdd91bceefc51238bcb967", size = 5270556, upload-time = "2025-07-01T09:16:09.961Z" }, { url = "https://files.pythonhosted.org/packages/2e/e6/231a0b76070c2cfd9e260a7a5b504fb72da0a95279410fa7afd99d9751d6/pillow-11.3.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", 
hash = "sha256:b5f56c3f344f2ccaf0dd875d3e180f631dc60a51b314295a3e681fe8cf851fbe", size = 4654625, upload-time = "2025-07-01T09:16:11.913Z" }, { url = "https://files.pythonhosted.org/packages/13/f4/10cf94fda33cb12765f2397fc285fa6d8eb9c29de7f3185165b702fc7386/pillow-11.3.0-pp310-pypy310_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e67d793d180c9df62f1f40aee3accca4829d3794c95098887edc18af4b8b780c", size = 4874207, upload-time = "2025-07-03T13:11:10.201Z" }, @@ -2252,52 +1946,13 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/35/91/9cb56efbb428b006bb85db28591e40b7736847b8331d43fe335acf95f6c8/propcache-0.3.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4445542398bd0b5d32df908031cb1b30d43ac848e20470a878b770ec2dcc6330", size = 265778, upload-time = "2025-06-09T22:55:36.45Z" }, { url = "https://files.pythonhosted.org/packages/9a/4c/b0fe775a2bdd01e176b14b574be679d84fc83958335790f7c9a686c1f468/propcache-0.3.2-cp313-cp313t-win32.whl", hash = "sha256:f86e5d7cd03afb3a1db8e9f9f6eff15794e79e791350ac48a8c924e6f439f394", size = 41175, upload-time = "2025-06-09T22:55:38.436Z" }, { url = "https://files.pythonhosted.org/packages/a4/ff/47f08595e3d9b5e149c150f88d9714574f1a7cbd89fe2817158a952674bf/propcache-0.3.2-cp313-cp313t-win_amd64.whl", hash = "sha256:9704bedf6e7cbe3c65eca4379a9b53ee6a83749f047808cbb5044d40d7d72198", size = 44857, upload-time = "2025-06-09T22:55:39.687Z" }, - { url = "https://files.pythonhosted.org/packages/6c/39/8ea9bcfaaff16fd0b0fc901ee522e24c9ec44b4ca0229cfffb8066a06959/propcache-0.3.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a7fad897f14d92086d6b03fdd2eb844777b0c4d7ec5e3bac0fbae2ab0602bbe5", size = 74678, upload-time = "2025-06-09T22:55:41.227Z" }, - { url = "https://files.pythonhosted.org/packages/d3/85/cab84c86966e1d354cf90cdc4ba52f32f99a5bca92a1529d666d957d7686/propcache-0.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1f43837d4ca000243fd7fd6301947d7cb93360d03cd08369969450cc6b2ce3b4", size = 
43829, upload-time = "2025-06-09T22:55:42.417Z" }, - { url = "https://files.pythonhosted.org/packages/23/f7/9cb719749152d8b26d63801b3220ce2d3931312b2744d2b3a088b0ee9947/propcache-0.3.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:261df2e9474a5949c46e962065d88eb9b96ce0f2bd30e9d3136bcde84befd8f2", size = 43729, upload-time = "2025-06-09T22:55:43.651Z" }, - { url = "https://files.pythonhosted.org/packages/a2/a2/0b2b5a210ff311260002a315f6f9531b65a36064dfb804655432b2f7d3e3/propcache-0.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e514326b79e51f0a177daab1052bc164d9d9e54133797a3a58d24c9c87a3fe6d", size = 204483, upload-time = "2025-06-09T22:55:45.327Z" }, - { url = "https://files.pythonhosted.org/packages/3f/e0/7aff5de0c535f783b0c8be5bdb750c305c1961d69fbb136939926e155d98/propcache-0.3.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d4a996adb6904f85894570301939afeee65f072b4fd265ed7e569e8d9058e4ec", size = 217425, upload-time = "2025-06-09T22:55:46.729Z" }, - { url = "https://files.pythonhosted.org/packages/92/1d/65fa889eb3b2a7d6e4ed3c2b568a9cb8817547a1450b572de7bf24872800/propcache-0.3.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:76cace5d6b2a54e55b137669b30f31aa15977eeed390c7cbfb1dafa8dfe9a701", size = 214723, upload-time = "2025-06-09T22:55:48.342Z" }, - { url = "https://files.pythonhosted.org/packages/9a/e2/eecf6989870988dfd731de408a6fa366e853d361a06c2133b5878ce821ad/propcache-0.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31248e44b81d59d6addbb182c4720f90b44e1efdc19f58112a3c3a1615fb47ef", size = 200166, upload-time = "2025-06-09T22:55:49.775Z" }, - { url = "https://files.pythonhosted.org/packages/12/06/c32be4950967f18f77489268488c7cdc78cbfc65a8ba8101b15e526b83dc/propcache-0.3.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:abb7fa19dbf88d3857363e0493b999b8011eea856b846305d8c0512dfdf8fbb1", size = 194004, upload-time = "2025-06-09T22:55:51.335Z" }, - { url = "https://files.pythonhosted.org/packages/46/6c/17b521a6b3b7cbe277a4064ff0aa9129dd8c89f425a5a9b6b4dd51cc3ff4/propcache-0.3.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:d81ac3ae39d38588ad0549e321e6f773a4e7cc68e7751524a22885d5bbadf886", size = 203075, upload-time = "2025-06-09T22:55:52.681Z" }, - { url = "https://files.pythonhosted.org/packages/62/cb/3bdba2b736b3e45bc0e40f4370f745b3e711d439ffbffe3ae416393eece9/propcache-0.3.2-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:cc2782eb0f7a16462285b6f8394bbbd0e1ee5f928034e941ffc444012224171b", size = 195407, upload-time = "2025-06-09T22:55:54.048Z" }, - { url = "https://files.pythonhosted.org/packages/29/bd/760c5c6a60a4a2c55a421bc34a25ba3919d49dee411ddb9d1493bb51d46e/propcache-0.3.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:db429c19a6c7e8a1c320e6a13c99799450f411b02251fb1b75e6217cf4a14fcb", size = 196045, upload-time = "2025-06-09T22:55:55.485Z" }, - { url = "https://files.pythonhosted.org/packages/76/58/ced2757a46f55b8c84358d6ab8de4faf57cba831c51e823654da7144b13a/propcache-0.3.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:21d8759141a9e00a681d35a1f160892a36fb6caa715ba0b832f7747da48fb6ea", size = 208432, upload-time = "2025-06-09T22:55:56.884Z" }, - { url = "https://files.pythonhosted.org/packages/bb/ec/d98ea8d5a4d8fe0e372033f5254eddf3254344c0c5dc6c49ab84349e4733/propcache-0.3.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:2ca6d378f09adb13837614ad2754fa8afaee330254f404299611bce41a8438cb", size = 210100, upload-time = "2025-06-09T22:55:58.498Z" }, - { url = "https://files.pythonhosted.org/packages/56/84/b6d8a7ecf3f62d7dd09d9d10bbf89fad6837970ef868b35b5ffa0d24d9de/propcache-0.3.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:34a624af06c048946709f4278b4176470073deda88d91342665d95f7c6270fbe", size = 200712, upload-time = 
"2025-06-09T22:55:59.906Z" }, - { url = "https://files.pythonhosted.org/packages/bf/32/889f4903ddfe4a9dc61da71ee58b763758cf2d608fe1decede06e6467f8d/propcache-0.3.2-cp39-cp39-win32.whl", hash = "sha256:4ba3fef1c30f306b1c274ce0b8baaa2c3cdd91f645c48f06394068f37d3837a1", size = 38187, upload-time = "2025-06-09T22:56:01.212Z" }, - { url = "https://files.pythonhosted.org/packages/67/74/d666795fb9ba1dc139d30de64f3b6fd1ff9c9d3d96ccfdb992cd715ce5d2/propcache-0.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:7a2368eed65fc69a7a7a40b27f22e85e7627b74216f0846b04ba5c116e191ec9", size = 42025, upload-time = "2025-06-09T22:56:02.875Z" }, { url = "https://files.pythonhosted.org/packages/cc/35/cc0aaecf278bb4575b8555f2b137de5ab821595ddae9da9d3cd1da4072c7/propcache-0.3.2-py3-none-any.whl", hash = "sha256:98f1ec44fb675f5052cccc8e609c46ed23a35a1cfd18545ad4e29002d858a43f", size = 12663, upload-time = "2025-06-09T22:56:04.484Z" }, ] -[[package]] -name = "protobuf" -version = "3.19.6" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -sdist = { url = "https://files.pythonhosted.org/packages/51/d1/79bfd1f481469b661a2eddab551255536401892722189433282bfb13cfb1/protobuf-3.19.6.tar.gz", hash = "sha256:5f5540d57a43042389e87661c6eaa50f47c19c6176e8cf1c4f287aeefeccb5c4", size = 218071, upload-time = "2022-09-29T22:07:23.03Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4b/3b/90f805b9e5ecacf8a216f2e5acabc2d3ad965b62803510be41804e6bfbfe/protobuf-3.19.6-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:010be24d5a44be7b0613750ab40bc8b8cedc796db468eae6c779b395f50d1fa1", size = 913631, upload-time = "2022-09-29T21:17:39.095Z" }, - { url = "https://files.pythonhosted.org/packages/26/ef/bd6ba3b4ff9a35944bdd325e2c9ee56f71e855757f7d43938232499f0278/protobuf-3.19.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11478547958c2dfea921920617eb457bc26867b0d1aa065ab05f35080c5d9eb6", size = 1055327, 
upload-time = "2022-09-29T21:17:41.054Z" }, - { url = "https://files.pythonhosted.org/packages/bc/db/8b33c9558f1f27dd74e7f9ad730c6b32efab431419af556b1659e125b041/protobuf-3.19.6-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:30a15015d86b9c3b8d6bf78d5b8c7749f2512c29f168ca259c9d7727604d0e39", size = 913657, upload-time = "2022-09-29T21:18:18.359Z" }, - { url = "https://files.pythonhosted.org/packages/51/61/e80b7a04f4e1b4eecc86582335205fd876abca0abafee4a6c001f70a375e/protobuf-3.19.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:878b4cd080a21ddda6ac6d1e163403ec6eea2e206cf225982ae04567d39be7b0", size = 1055457, upload-time = "2022-09-29T21:18:20.212Z" }, - { url = "https://files.pythonhosted.org/packages/32/27/1141a8232723dcb10a595cc0ce4321dcbbd5215300bf4acfc142343205bf/protobuf-3.19.6-py2.py3-none-any.whl", hash = "sha256:14082457dc02be946f60b15aad35e9f5c69e738f80ebbc0900a19bc83734a5a4", size = 162648, upload-time = "2022-09-29T22:07:20.303Z" }, -] - [[package]] name = "protobuf" version = "6.32.1" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14'", - "python_full_version == '3.13.*'", - "python_full_version == '3.12.*'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/fa/a4/cc17347aa2897568beece2e674674359f911d6fe21b0b8d6268cd42727ac/protobuf-6.32.1.tar.gz", hash = "sha256:ee2469e4a021474ab9baafea6cd070e5bf27c7d29433504ddea1a4ee5850f68d", size = 440635, upload-time = "2025-09-11T21:38:42.935Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/3f/be/8dd0a927c559b37d7a6c8ab79034fd167dcc1f851595f2e641ad62be8643/protobuf-6.32.1-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:2f5b80a49e1eb7b86d85fcd23fe92df154b9730a725c3b38c4e43b9d77018bf4", size = 322874, upload-time = "2025-09-11T21:38:35.509Z" }, @@ -2335,8 +1990,7 @@ name = "pyarrow" version = "21.0.0" source = { registry = 
"https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.10.*'", - "python_full_version < '3.10'", + "python_full_version < '3.11'", ] sdist = { url = "https://files.pythonhosted.org/packages/ef/c2/ea068b8f00905c06329a3dfcd40d0fcc2b7d0f2e355bdb25b65e0a0e4cd4/pyarrow-21.0.0.tar.gz", hash = "sha256:5051f2dccf0e283ff56335760cbc8622cf52264d67e359d5569541ac11b6d5bc", size = 1133487, upload-time = "2025-07-18T00:57:31.761Z" } wheels = [ @@ -2375,13 +2029,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0a/f9/4ee798dc902533159250fb4321267730bc0a107d8c6889e07c3add4fe3a5/pyarrow-21.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:fc0d2f88b81dcf3ccf9a6ae17f89183762c8a94a5bdcfa09e05cfe413acf0503", size = 43276625, upload-time = "2025-07-18T00:56:48.002Z" }, { url = "https://files.pythonhosted.org/packages/5a/da/e02544d6997037a4b0d22d8e5f66bc9315c3671371a8b18c79ade1cefe14/pyarrow-21.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6299449adf89df38537837487a4f8d3bd91ec94354fdd2a7d30bc11c48ef6e79", size = 44951890, upload-time = "2025-07-18T00:56:52.568Z" }, { url = "https://files.pythonhosted.org/packages/e5/4e/519c1bc1876625fe6b71e9a28287c43ec2f20f73c658b9ae1d485c0c206e/pyarrow-21.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:222c39e2c70113543982c6b34f3077962b44fca38c0bd9e68bb6781534425c10", size = 26371006, upload-time = "2025-07-18T00:56:56.379Z" }, - { url = "https://files.pythonhosted.org/packages/3e/cc/ce4939f4b316457a083dc5718b3982801e8c33f921b3c98e7a93b7c7491f/pyarrow-21.0.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:a7f6524e3747e35f80744537c78e7302cd41deee8baa668d56d55f77d9c464b3", size = 31211248, upload-time = "2025-07-18T00:56:59.7Z" }, - { url = "https://files.pythonhosted.org/packages/1f/c2/7a860931420d73985e2f340f06516b21740c15b28d24a0e99a900bb27d2b/pyarrow-21.0.0-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:203003786c9fd253ebcafa44b03c06983c9c8d06c3145e37f1b76a1f317aeae1", size = 32676896, 
upload-time = "2025-07-18T00:57:03.884Z" }, - { url = "https://files.pythonhosted.org/packages/68/a8/197f989b9a75e59b4ca0db6a13c56f19a0ad8a298c68da9cc28145e0bb97/pyarrow-21.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:3b4d97e297741796fead24867a8dabf86c87e4584ccc03167e4a811f50fdf74d", size = 41067862, upload-time = "2025-07-18T00:57:07.587Z" }, - { url = "https://files.pythonhosted.org/packages/fa/82/6ecfa89487b35aa21accb014b64e0a6b814cc860d5e3170287bf5135c7d8/pyarrow-21.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:898afce396b80fdda05e3086b4256f8677c671f7b1d27a6976fa011d3fd0a86e", size = 42747508, upload-time = "2025-07-18T00:57:13.917Z" }, - { url = "https://files.pythonhosted.org/packages/3b/b7/ba252f399bbf3addc731e8643c05532cf32e74cebb5e32f8f7409bc243cf/pyarrow-21.0.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:067c66ca29aaedae08218569a114e413b26e742171f526e828e1064fcdec13f4", size = 43345293, upload-time = "2025-07-18T00:57:19.828Z" }, - { url = "https://files.pythonhosted.org/packages/ff/0a/a20819795bd702b9486f536a8eeb70a6aa64046fce32071c19ec8230dbaa/pyarrow-21.0.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:0c4e75d13eb76295a49e0ea056eb18dbd87d81450bfeb8afa19a7e5a75ae2ad7", size = 45060670, upload-time = "2025-07-18T00:57:24.477Z" }, - { url = "https://files.pythonhosted.org/packages/10/15/6b30e77872012bbfe8265d42a01d5b3c17ef0ac0f2fae531ad91b6a6c02e/pyarrow-21.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:cdc4c17afda4dab2a9c0b79148a43a7f4e1094916b3e18d8975bfd6d6d52241f", size = 26227521, upload-time = "2025-07-18T00:57:29.119Z" }, ] [[package]] @@ -2554,19 +2201,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5c/96/5fb7d8c3c17bc8c62fdb031c47d77a1af698f1d7a406b0f79aaa1338f9ad/pydantic_core-2.41.5-cp314-cp314t-win32.whl", hash = "sha256:b4ececa40ac28afa90871c2cc2b9ffd2ff0bf749380fbdf57d165fd23da353aa", size = 1988906, upload-time = "2025-11-04T13:41:56.606Z" }, { url = 
"https://files.pythonhosted.org/packages/22/ed/182129d83032702912c2e2d8bbe33c036f342cc735737064668585dac28f/pydantic_core-2.41.5-cp314-cp314t-win_amd64.whl", hash = "sha256:80aa89cad80b32a912a65332f64a4450ed00966111b6615ca6816153d3585a8c", size = 1981607, upload-time = "2025-11-04T13:41:58.889Z" }, { url = "https://files.pythonhosted.org/packages/9f/ed/068e41660b832bb0b1aa5b58011dea2a3fe0ba7861ff38c4d4904c1c1a99/pydantic_core-2.41.5-cp314-cp314t-win_arm64.whl", hash = "sha256:35b44f37a3199f771c3eaa53051bc8a70cd7b54f333531c59e29fd4db5d15008", size = 1974769, upload-time = "2025-11-04T13:42:01.186Z" }, - { url = "https://files.pythonhosted.org/packages/54/db/160dffb57ed9a3705c4cbcbff0ac03bdae45f1ca7d58ab74645550df3fbd/pydantic_core-2.41.5-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:8bfeaf8735be79f225f3fefab7f941c712aaca36f1128c9d7e2352ee1aa87bdf", size = 2107999, upload-time = "2025-11-04T13:42:03.885Z" }, - { url = "https://files.pythonhosted.org/packages/a3/7d/88e7de946f60d9263cc84819f32513520b85c0f8322f9b8f6e4afc938383/pydantic_core-2.41.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:346285d28e4c8017da95144c7f3acd42740d637ff41946af5ce6e5e420502dd5", size = 1929745, upload-time = "2025-11-04T13:42:06.075Z" }, - { url = "https://files.pythonhosted.org/packages/d5/c2/aef51e5b283780e85e99ff19db0f05842d2d4a8a8cd15e63b0280029b08f/pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a75dafbf87d6276ddc5b2bf6fae5254e3d0876b626eb24969a574fff9149ee5d", size = 1920220, upload-time = "2025-11-04T13:42:08.457Z" }, - { url = "https://files.pythonhosted.org/packages/c7/97/492ab10f9ac8695cd76b2fdb24e9e61f394051df71594e9bcc891c9f586e/pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7b93a4d08587e2b7e7882de461e82b6ed76d9026ce91ca7915e740ecc7855f60", size = 2067296, upload-time = "2025-11-04T13:42:10.817Z" }, - { url = 
"https://files.pythonhosted.org/packages/ec/23/984149650e5269c59a2a4c41d234a9570adc68ab29981825cfaf4cfad8f4/pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e8465ab91a4bd96d36dde3263f06caa6a8a6019e4113f24dc753d79a8b3a3f82", size = 2231548, upload-time = "2025-11-04T13:42:13.843Z" }, - { url = "https://files.pythonhosted.org/packages/71/0c/85bcbb885b9732c28bec67a222dbed5ed2d77baee1f8bba2002e8cd00c5c/pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:299e0a22e7ae2b85c1a57f104538b2656e8ab1873511fd718a1c1c6f149b77b5", size = 2362571, upload-time = "2025-11-04T13:42:16.208Z" }, - { url = "https://files.pythonhosted.org/packages/c0/4a/412d2048be12c334003e9b823a3fa3d038e46cc2d64dd8aab50b31b65499/pydantic_core-2.41.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:707625ef0983fcfb461acfaf14de2067c5942c6bb0f3b4c99158bed6fedd3cf3", size = 2068175, upload-time = "2025-11-04T13:42:18.911Z" }, - { url = "https://files.pythonhosted.org/packages/73/f4/c58b6a776b502d0a5540ad02e232514285513572060f0d78f7832ca3c98b/pydantic_core-2.41.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f41eb9797986d6ebac5e8edff36d5cef9de40def462311b3eb3eeded1431e425", size = 2177203, upload-time = "2025-11-04T13:42:22.578Z" }, - { url = "https://files.pythonhosted.org/packages/ed/ae/f06ea4c7e7a9eead3d165e7623cd2ea0cb788e277e4f935af63fc98fa4e6/pydantic_core-2.41.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0384e2e1021894b1ff5a786dbf94771e2986ebe2869533874d7e43bc79c6f504", size = 2148191, upload-time = "2025-11-04T13:42:24.89Z" }, - { url = "https://files.pythonhosted.org/packages/c1/57/25a11dcdc656bf5f8b05902c3c2934ac3ea296257cc4a3f79a6319e61856/pydantic_core-2.41.5-cp39-cp39-musllinux_1_1_armv7l.whl", hash = "sha256:f0cd744688278965817fd0839c4a4116add48d23890d468bc436f78beb28abf5", size = 2343907, upload-time = "2025-11-04T13:42:27.683Z" }, - { url = 
"https://files.pythonhosted.org/packages/96/82/e33d5f4933d7a03327c0c43c65d575e5919d4974ffc026bc917a5f7b9f61/pydantic_core-2.41.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:753e230374206729bf0a807954bcc6c150d3743928a73faffee51ac6557a03c3", size = 2322174, upload-time = "2025-11-04T13:42:30.776Z" }, - { url = "https://files.pythonhosted.org/packages/81/45/4091be67ce9f469e81656f880f3506f6a5624121ec5eb3eab37d7581897d/pydantic_core-2.41.5-cp39-cp39-win32.whl", hash = "sha256:873e0d5b4fb9b89ef7c2d2a963ea7d02879d9da0da8d9d4933dee8ee86a8b460", size = 1990353, upload-time = "2025-11-04T13:42:33.111Z" }, - { url = "https://files.pythonhosted.org/packages/44/8a/a98aede18db6e9cd5d66bcacd8a409fcf8134204cdede2e7de35c5a2c5ef/pydantic_core-2.41.5-cp39-cp39-win_amd64.whl", hash = "sha256:e4f4a984405e91527a0d62649ee21138f8e3d0ef103be488c1dc11a80d7f184b", size = 2015698, upload-time = "2025-11-04T13:42:35.484Z" }, { url = "https://files.pythonhosted.org/packages/11/72/90fda5ee3b97e51c494938a4a44c3a35a9c96c19bba12372fb9c634d6f57/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:b96d5f26b05d03cc60f11a7761a5ded1741da411e7fe0909e27a5e6a0cb7b034", size = 2115441, upload-time = "2025-11-04T13:42:39.557Z" }, { url = "https://files.pythonhosted.org/packages/1f/53/8942f884fa33f50794f119012dc6a1a02ac43a56407adaac20463df8e98f/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:634e8609e89ceecea15e2d61bc9ac3718caaaa71963717bf3c8f38bfde64242c", size = 1930291, upload-time = "2025-11-04T13:42:42.169Z" }, { url = "https://files.pythonhosted.org/packages/79/c8/ecb9ed9cd942bce09fc888ee960b52654fbdbede4ba6c2d6e0d3b1d8b49c/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:93e8740d7503eb008aa2df04d3b9735f845d43ae845e6dcd2be0b55a2da43cd2", size = 1948632, upload-time = "2025-11-04T13:42:44.564Z" }, @@ -2607,8 +2241,7 @@ name = "pylance" 
source = { editable = "." } dependencies = [ { name = "lance-namespace" }, - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "pyarrow", version = "21.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "pyarrow", version = "23.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, @@ -2628,9 +2261,8 @@ geo = [ ] tests = [ { name = "boto3" }, - { name = "datafusion", marker = "python_full_version >= '3.10'" }, - { name = "datasets", version = "0.0.9", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "datasets", version = "4.1.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "datafusion" }, + { name = "datasets" }, { name = "duckdb" }, { name = "ml-dtypes" }, { name = "pandas" }, @@ -2638,8 +2270,7 @@ tests = [ { name = "polars", extra = ["pandas", "pyarrow"] }, { name = "psutil" }, { name = "pytest" }, - { name = "tensorflow", version = "2.7.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' and sys_platform == 'linux'" }, - { name = "tensorflow", version = "2.20.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' and sys_platform == 'linux'" }, + { name = "tensorflow", marker = "sys_platform == 'linux'" }, { name = "tqdm" }, ] torch = [ @@ -2657,8 +2288,8 @@ dev = [ ] tests = [ { 
name = "boto3" }, - { name = "datafusion", marker = "python_full_version >= '3.10'" }, - { name = "datasets", version = "4.1.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "datafusion" }, + { name = "datasets" }, { name = "duckdb" }, { name = "ml-dtypes" }, { name = "pandas" }, @@ -2666,14 +2297,14 @@ tests = [ { name = "polars", extra = ["pandas", "pyarrow"] }, { name = "psutil" }, { name = "pytest" }, - { name = "tensorflow", version = "2.20.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' and sys_platform == 'linux'" }, + { name = "tensorflow", marker = "sys_platform == 'linux'" }, { name = "tqdm" }, ] [package.metadata] requires-dist = [ { name = "boto3", marker = "extra == 'tests'" }, - { name = "datafusion", marker = "python_full_version >= '3.10' and extra == 'tests'", specifier = ">=53,<54" }, + { name = "datafusion", marker = "extra == 'tests'", specifier = ">=53,<54" }, { name = "datasets", marker = "extra == 'tests'" }, { name = "duckdb", marker = "extra == 'tests'" }, { name = "geoarrow-rust-core", marker = "extra == 'geo'" }, @@ -2705,8 +2336,8 @@ dev = [ ] tests = [ { name = "boto3", specifier = "==1.40.43" }, - { name = "datafusion", marker = "python_full_version >= '3.10'", specifier = "==53.0.0" }, - { name = "datasets", marker = "python_full_version >= '3.10'", specifier = "==4.1.1" }, + { name = "datafusion", specifier = "==53.0.0" }, + { name = "datasets", specifier = "==4.1.1" }, { name = "duckdb", specifier = "==1.4.0" }, { name = "ml-dtypes", specifier = "==0.5.3" }, { name = "pandas", specifier = "==2.3.3" }, @@ -2714,59 +2345,19 @@ tests = [ { name = "polars", extras = ["pyarrow", "pandas"], specifier = "==1.34.0" }, { name = "psutil", specifier = "==7.1.0" }, { name = "pytest", specifier = "==8.4.2" }, - { name = "tensorflow", marker = "python_full_version >= '3.10' and sys_platform == 'linux'", specifier = "==2.20.0" }, + { name = 
"tensorflow", marker = "sys_platform == 'linux'", specifier = "==2.20.0" }, { name = "tqdm", specifier = "==4.67.1" }, ] -[[package]] -name = "pyproj" -version = "3.6.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -dependencies = [ - { name = "certifi", marker = "python_full_version < '3.10'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/7d/84/2b39bbf888c753ea48b40d47511548c77aa03445465c35cc4c4e9649b643/pyproj-3.6.1.tar.gz", hash = "sha256:44aa7c704c2b7d8fb3d483bbf75af6cb2350d30a63b144279a09b75fead501bf", size = 225131, upload-time = "2023-09-21T02:07:51.593Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c5/32/63cf474f4a8d4804b3bdf7c16b8589f38142e8e2f8319dcea27e0bc21a87/pyproj-3.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ab7aa4d9ff3c3acf60d4b285ccec134167a948df02347585fdd934ebad8811b4", size = 6142763, upload-time = "2023-09-21T02:07:12.844Z" }, - { url = "https://files.pythonhosted.org/packages/18/86/2e7cb9de40492f1bafbf11f4c9072edc394509a40b5e4c52f8139546f039/pyproj-3.6.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4bc0472302919e59114aa140fd7213c2370d848a7249d09704f10f5b062031fe", size = 4877123, upload-time = "2023-09-21T02:10:37.905Z" }, - { url = "https://files.pythonhosted.org/packages/5e/c5/928d5a26995dbefbebd7507d982141cd9153bc7e4392b334fff722c4af12/pyproj-3.6.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5279586013b8d6582e22b6f9e30c49796966770389a9d5b85e25a4223286cd3f", size = 6190576, upload-time = "2023-09-21T02:17:08.637Z" }, - { url = "https://files.pythonhosted.org/packages/f6/2b/b60cf73b0720abca313bfffef34e34f7f7dae23852b2853cf0368d49426b/pyproj-3.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80fafd1f3eb421694857f254a9bdbacd1eb22fc6c24ca74b136679f376f97d35", size = 8328075, upload-time = "2023-09-21T02:07:15.353Z" }, - { url = 
"https://files.pythonhosted.org/packages/d9/a8/7193f46032636be917bc775506ae987aad72c931b1f691b775ca812a2917/pyproj-3.6.1-cp310-cp310-win32.whl", hash = "sha256:c41e80ddee130450dcb8829af7118f1ab69eaf8169c4bf0ee8d52b72f098dc2f", size = 5635713, upload-time = "2023-09-21T02:07:17.548Z" }, - { url = "https://files.pythonhosted.org/packages/89/8f/27350c8fba71a37cd0d316f100fbd96bf139cc2b5ff1ab0dcbc7ac64010a/pyproj-3.6.1-cp310-cp310-win_amd64.whl", hash = "sha256:db3aedd458e7f7f21d8176f0a1d924f1ae06d725228302b872885a1c34f3119e", size = 6087932, upload-time = "2023-09-21T02:07:19.793Z" }, - { url = "https://files.pythonhosted.org/packages/84/a6/a300c1b14b2112e966e9f90b18f9c13b586bdcf417207cee913ae9005da3/pyproj-3.6.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ebfbdbd0936e178091309f6cd4fcb4decd9eab12aa513cdd9add89efa3ec2882", size = 6147442, upload-time = "2023-09-21T02:07:21.879Z" }, - { url = "https://files.pythonhosted.org/packages/30/bd/b9bd3761f08754e8dbb34c5a647db2099b348ab5da338e90980caf280e37/pyproj-3.6.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:447db19c7efad70ff161e5e46a54ab9cc2399acebb656b6ccf63e4bc4a04b97a", size = 4880331, upload-time = "2023-09-21T02:10:40.828Z" }, - { url = "https://files.pythonhosted.org/packages/f4/0a/d82aeeb605b5d6870bc72307c3b5e044e632eb7720df8885e144f51a8eac/pyproj-3.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e7e13c40183884ec7f94eb8e0f622f08f1d5716150b8d7a134de48c6110fee85", size = 6192425, upload-time = "2023-09-21T02:17:09.049Z" }, - { url = "https://files.pythonhosted.org/packages/64/90/dfe5c00de1ca4dbb82606e79790659d4ed7f0ed8d372bccb3baca2a5abe0/pyproj-3.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:65ad699e0c830e2b8565afe42bd58cc972b47d829b2e0e48ad9638386d994915", size = 8571478, upload-time = "2023-09-21T02:07:23.771Z" }, - { url = 
"https://files.pythonhosted.org/packages/14/6d/ae373629a1723f0db80d7b8c93598b00d9ecb930ed9ebf4f35826a33e97c/pyproj-3.6.1-cp311-cp311-win32.whl", hash = "sha256:8b8acc31fb8702c54625f4d5a2a6543557bec3c28a0ef638778b7ab1d1772132", size = 5634575, upload-time = "2023-09-21T02:07:26.535Z" }, - { url = "https://files.pythonhosted.org/packages/79/95/eb68113c5b5737c342bde1bab92705dabe69c16299c5a122616e50f1fbd6/pyproj-3.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:38a3361941eb72b82bd9a18f60c78b0df8408416f9340521df442cebfc4306e2", size = 6088494, upload-time = "2023-09-21T02:07:28.75Z" }, - { url = "https://files.pythonhosted.org/packages/0b/64/93232511a7906a492b1b7dfdfc17f4e95982d76a24ef4f86d18cfe7ae2c9/pyproj-3.6.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:1e9fbaf920f0f9b4ee62aab832be3ae3968f33f24e2e3f7fbb8c6728ef1d9746", size = 6135280, upload-time = "2023-09-21T02:07:30.911Z" }, - { url = "https://files.pythonhosted.org/packages/10/f2/b550b1f65cc7e51c9116b220b50aade60c439103432a3fd5b12efbc77e15/pyproj-3.6.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6d227a865356f225591b6732430b1d1781e946893789a609bb34f59d09b8b0f8", size = 4880030, upload-time = "2023-09-21T02:10:43.067Z" }, - { url = "https://files.pythonhosted.org/packages/fe/4b/2f8f6f94643b9fe2083338eff294feda84d916409b5840b7a402d2be93f8/pyproj-3.6.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83039e5ae04e5afc974f7d25ee0870a80a6bd6b7957c3aca5613ccbe0d3e72bf", size = 6184439, upload-time = "2023-09-21T02:17:43.499Z" }, - { url = "https://files.pythonhosted.org/packages/19/9b/c57569132174786aa3f72275ac306956859a639dad0ce8d95c8411ce8209/pyproj-3.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fffb059ba3bced6f6725961ba758649261d85ed6ce670d3e3b0a26e81cf1aa8d", size = 8660747, upload-time = "2023-09-21T02:07:32.586Z" }, - { url = 
"https://files.pythonhosted.org/packages/0e/ab/1c2159ec757677c5a6b8803f6be45c2b550dc42c84ec4a228dc219849bbb/pyproj-3.6.1-cp312-cp312-win32.whl", hash = "sha256:2d6ff73cc6dbbce3766b6c0bce70ce070193105d8de17aa2470009463682a8eb", size = 5626805, upload-time = "2023-09-21T02:07:35.28Z" }, - { url = "https://files.pythonhosted.org/packages/c7/f3/2f32fe143cd7ba1d4d68f1b6dce9ca402d909cbd5a5830e3a8fa3d1acbbf/pyproj-3.6.1-cp312-cp312-win_amd64.whl", hash = "sha256:7a27151ddad8e1439ba70c9b4b2b617b290c39395fa9ddb7411ebb0eb86d6fb0", size = 6079779, upload-time = "2023-09-21T02:07:37.486Z" }, - { url = "https://files.pythonhosted.org/packages/d7/50/d369bbe62d7a0d1e2cb40bc211da86a3f6e0f3c99f872957a72c3d5492d6/pyproj-3.6.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4ba1f9b03d04d8cab24d6375609070580a26ce76eaed54631f03bab00a9c737b", size = 6144755, upload-time = "2023-09-21T02:07:39.611Z" }, - { url = "https://files.pythonhosted.org/packages/2c/c2/8d4f61065dfed965e53badd41201ad86a05af0c1bbc75dffb12ef0f5a7dd/pyproj-3.6.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:18faa54a3ca475bfe6255156f2f2874e9a1c8917b0004eee9f664b86ccc513d3", size = 4879187, upload-time = "2023-09-21T02:10:45.519Z" }, - { url = "https://files.pythonhosted.org/packages/31/38/2cf8777cb2d5622a78195e690281b7029098795fde4751aec8128238b8bb/pyproj-3.6.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd43bd9a9b9239805f406fd82ba6b106bf4838d9ef37c167d3ed70383943ade1", size = 6192339, upload-time = "2023-09-21T02:17:09.942Z" }, - { url = "https://files.pythonhosted.org/packages/97/0a/b1525be9680369cc06dd288e12c59d24d5798b4afcdcf1b0915836e1caa6/pyproj-3.6.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50100b2726a3ca946906cbaa789dd0749f213abf0cbb877e6de72ca7aa50e1ae", size = 8332638, upload-time = "2023-09-21T02:07:41.777Z" }, - { url = 
"https://files.pythonhosted.org/packages/8d/e8/e826e0a962f36bd925a933829cf6ef218efe2055db5ea292be40974a929d/pyproj-3.6.1-cp39-cp39-win32.whl", hash = "sha256:9274880263256f6292ff644ca92c46d96aa7e57a75c6df3f11d636ce845a1877", size = 5638159, upload-time = "2023-09-21T02:07:43.49Z" }, - { url = "https://files.pythonhosted.org/packages/43/d0/cbe29a4dcf38ee7e72bf695d0d3f2bee21b4f22ee6cf579ad974de9edfc8/pyproj-3.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:36b64c2cb6ea1cc091f329c5bd34f9c01bb5da8c8e4492c709bda6a09f96808f", size = 6090565, upload-time = "2023-09-21T02:07:45.735Z" }, - { url = "https://files.pythonhosted.org/packages/43/28/e8d2ca71dd56c27cbe668e4226963d61956cded222a2e839e6fec1ab6d82/pyproj-3.6.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:fd93c1a0c6c4aedc77c0fe275a9f2aba4d59b8acf88cebfc19fe3c430cfabf4f", size = 6034252, upload-time = "2023-09-21T02:07:47.906Z" }, - { url = "https://files.pythonhosted.org/packages/cb/39/1ce27cb86f51a1f5aed3a1617802a6131b59ea78492141d1fbe36722595e/pyproj-3.6.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6420ea8e7d2a88cb148b124429fba8cd2e0fae700a2d96eab7083c0928a85110", size = 6386263, upload-time = "2023-09-21T02:07:49.586Z" }, -] - [[package]] name = "pyproj" version = "3.7.1" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.10.*'", + "python_full_version < '3.11'", ] dependencies = [ - { name = "certifi", marker = "python_full_version == '3.10.*'" }, + { name = "certifi", marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/67/10/a8480ea27ea4bbe896c168808854d00f2a9b49f95c0319ddcbba693c8a90/pyproj-3.7.1.tar.gz", hash = "sha256:60d72facd7b6b79853f19744779abcd3f804c4e0d4fa8815469db20c9f640a47", size = 226339, upload-time = "2025-02-16T04:28:46.621Z" } wheels = [ @@ -3002,15 +2593,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923, upload-time = "2025-09-25T21:32:54.537Z" }, { url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" }, { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, - { url = "https://files.pythonhosted.org/packages/9f/62/67fc8e68a75f738c9200422bf65693fb79a4cd0dc5b23310e5202e978090/pyyaml-6.0.3-cp39-cp39-macosx_10_13_x86_64.whl", hash = "sha256:b865addae83924361678b652338317d1bd7e79b1f4596f96b96c77a5a34b34da", size = 184450, upload-time = "2025-09-25T21:33:00.618Z" }, - { url = "https://files.pythonhosted.org/packages/ae/92/861f152ce87c452b11b9d0977952259aa7df792d71c1053365cc7b09cc08/pyyaml-6.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c3355370a2c156cffb25e876646f149d5d68f5e0a3ce86a5084dd0b64a994917", size = 174319, upload-time = "2025-09-25T21:33:02.086Z" }, - { url = "https://files.pythonhosted.org/packages/d0/cd/f0cfc8c74f8a030017a2b9c771b7f47e5dd702c3e28e5b2071374bda2948/pyyaml-6.0.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3c5677e12444c15717b902a5798264fa7909e41153cdf9ef7ad571b704a63dd9", size = 737631, upload-time = "2025-09-25T21:33:03.25Z" }, - { url = 
"https://files.pythonhosted.org/packages/ef/b2/18f2bd28cd2055a79a46c9b0895c0b3d987ce40ee471cecf58a1a0199805/pyyaml-6.0.3-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5ed875a24292240029e4483f9d4a4b8a1ae08843b9c54f43fcc11e404532a8a5", size = 836795, upload-time = "2025-09-25T21:33:05.014Z" }, - { url = "https://files.pythonhosted.org/packages/73/b9/793686b2d54b531203c160ef12bec60228a0109c79bae6c1277961026770/pyyaml-6.0.3-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0150219816b6a1fa26fb4699fb7daa9caf09eb1999f3b70fb6e786805e80375a", size = 750767, upload-time = "2025-09-25T21:33:06.398Z" }, - { url = "https://files.pythonhosted.org/packages/a9/86/a137b39a611def2ed78b0e66ce2fe13ee701a07c07aebe55c340ed2a050e/pyyaml-6.0.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:fa160448684b4e94d80416c0fa4aac48967a969efe22931448d853ada8baf926", size = 727982, upload-time = "2025-09-25T21:33:08.708Z" }, - { url = "https://files.pythonhosted.org/packages/dd/62/71c27c94f457cf4418ef8ccc71735324c549f7e3ea9d34aba50874563561/pyyaml-6.0.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:27c0abcb4a5dac13684a37f76e701e054692a9b2d3064b70f5e4eb54810553d7", size = 755677, upload-time = "2025-09-25T21:33:09.876Z" }, - { url = "https://files.pythonhosted.org/packages/29/3d/6f5e0d58bd924fb0d06c3a6bad00effbdae2de5adb5cda5648006ffbd8d3/pyyaml-6.0.3-cp39-cp39-win32.whl", hash = "sha256:1ebe39cb5fc479422b83de611d14e2c0d3bb2a18bbcb01f229ab3cfbd8fee7a0", size = 142592, upload-time = "2025-09-25T21:33:10.983Z" }, - { url = "https://files.pythonhosted.org/packages/f0/0c/25113e0b5e103d7f1490c0e947e303fe4a696c10b501dea7a9f49d4e876c/pyyaml-6.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:2e71d11abed7344e42a8849600193d15b6def118602c4c176f748e4583246007", size = 158777, upload-time = "2025-09-25T21:33:15.55Z" }, ] [[package]] @@ -3018,10 +2600,10 @@ name = "requests" version = "2.33.0" source = { registry = 
"https://pypi.org/simple" } dependencies = [ - { name = "certifi", marker = "python_full_version >= '3.10'" }, - { name = "charset-normalizer", marker = "python_full_version >= '3.10'" }, - { name = "idna", marker = "python_full_version >= '3.10'" }, - { name = "urllib3", version = "2.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "certifi" }, + { name = "charset-normalizer" }, + { name = "idna" }, + { name = "urllib3" }, ] sdist = { url = "https://files.pythonhosted.org/packages/34/64/8860370b167a9721e8956ae116825caff829224fbca0ca6e7bf8ddef8430/requests-2.33.0.tar.gz", hash = "sha256:c7ebc5e8b0f21837386ad0e1c8fe8b829fa5f544d8df3b2253bff14ef29d7652", size = 134232, upload-time = "2026-03-25T15:10:41.586Z" } wheels = [ @@ -3033,8 +2615,8 @@ name = "rich" version = "14.1.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "markdown-it-py", marker = "python_full_version >= '3.10'" }, - { name = "pygments", marker = "python_full_version >= '3.10'" }, + { name = "markdown-it-py" }, + { name = "pygments" }, ] sdist = { url = "https://files.pythonhosted.org/packages/fe/75/af448d8e52bf1d8fa6a9d089ca6c07ff4453d86c65c145d0a300bb073b9b/rich-14.1.0.tar.gz", hash = "sha256:e497a48b844b0320d45007cdebfeaeed8db2a4f4bcf49f15e455cfc4af11eaa8", size = 224441, upload-time = "2025-07-25T07:32:58.125Z" } wheels = [ @@ -3116,13 +2698,11 @@ dependencies = [ { name = "absl-py" }, { name = "grpcio" }, { name = "markdown" }, - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.3", source = { registry = 
"https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "packaging" }, { name = "pillow" }, - { name = "protobuf", version = "3.19.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "protobuf", version = "6.32.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "protobuf" }, { name = "setuptools" }, { name = "tensorboard-data-server" }, { name = "werkzeug" }, @@ -3140,74 +2720,33 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/73/c6/825dab04195756cf8ff2e12698f22513b3db2f64925bdd41671bfb33aaa5/tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl", hash = "sha256:ef687163c24185ae9754ed5650eb5bc4d84ff257aabdc33f0cc6f74d8ba54530", size = 6590363, upload-time = "2023-10-23T21:23:35.583Z" }, ] -[[package]] -name = "tensorflow" -version = "2.7.4" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -dependencies = [ - { name = "absl-py", marker = "python_full_version < '3.10'" }, - { name = "astunparse", marker = "python_full_version < '3.10'" }, - { name = "flatbuffers", version = "2.0.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "gast", version = "0.4.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "google-pasta", marker = "python_full_version < '3.10'" }, - { name = "grpcio", marker = "python_full_version < '3.10'" }, - { name = "h5py", marker = "python_full_version < '3.10'" }, - { name = "keras", version = "2.7.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "keras-preprocessing", marker = "python_full_version < '3.10'" }, - { name = "libclang", marker = "python_full_version < '3.10'" }, - { name = "numpy", version = "2.0.2", source = { registry = 
"https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "opt-einsum", marker = "python_full_version < '3.10'" }, - { name = "protobuf", version = "3.19.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "six", marker = "python_full_version < '3.10'" }, - { name = "tensorboard", marker = "python_full_version < '3.10'" }, - { name = "tensorflow-estimator", marker = "python_full_version < '3.10'" }, - { name = "tensorflow-io-gcs-filesystem", marker = "python_full_version < '3.10'" }, - { name = "termcolor", marker = "python_full_version < '3.10'" }, - { name = "typing-extensions", marker = "python_full_version < '3.10'" }, - { name = "wheel", marker = "python_full_version < '3.10'" }, - { name = "wrapt", marker = "python_full_version < '3.10'" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/5e/31/d49a3dff9c4ca6e6c09c2c5fea95f58cf59cc3cd4f0d557069c7dccd6f57/tensorflow-2.7.4-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:c4597635dd71fc6809b7fffcb462524d73e2ade09da61844059e6a2fead71140", size = 496066688, upload-time = "2022-09-02T19:11:01.631Z" }, -] - [[package]] name = "tensorflow" version = "2.20.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14'", - "python_full_version == '3.13.*'", - "python_full_version == '3.12.*'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", -] dependencies = [ - { name = "absl-py", marker = "python_full_version >= '3.10'" }, - { name = "astunparse", marker = "python_full_version >= '3.10'" }, - { name = "flatbuffers", version = "25.9.23", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "gast", version = "0.6.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "google-pasta", marker = "python_full_version >= '3.10'" }, - { name = 
"grpcio", marker = "python_full_version >= '3.10'" }, - { name = "h5py", marker = "python_full_version >= '3.10'" }, - { name = "keras", version = "3.11.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "libclang", marker = "python_full_version >= '3.10'" }, - { name = "ml-dtypes", marker = "python_full_version >= '3.10'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "absl-py" }, + { name = "astunparse" }, + { name = "flatbuffers" }, + { name = "gast" }, + { name = "google-pasta" }, + { name = "grpcio" }, + { name = "h5py" }, + { name = "keras" }, + { name = "libclang" }, + { name = "ml-dtypes" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "opt-einsum", marker = "python_full_version >= '3.10'" }, - { name = "packaging", marker = "python_full_version >= '3.10'" }, - { name = "protobuf", version = "6.32.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "requests", marker = "python_full_version >= '3.10'" }, - { name = "setuptools", marker = "python_full_version >= '3.10'" }, - { name = "six", marker = "python_full_version >= '3.10'" }, - { name = "tensorboard", marker = "python_full_version >= '3.10'" }, - { name = "termcolor", marker = "python_full_version >= '3.10'" }, - { name = "typing-extensions", marker = "python_full_version >= '3.10'" }, - { name = "wrapt", marker = "python_full_version >= '3.10'" }, + { name = "opt-einsum" }, + { name = "packaging" }, + { name = "protobuf" }, + { name = "requests" }, + { name = "setuptools" }, + { name = "six" }, + { name = "tensorboard" }, + { name = "termcolor" }, + { name = 
"typing-extensions" }, + { name = "wrapt" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/ff/07/ea91ac67a9fd36d3372099f5a3e69860ded544f877f5f2117802388f4212/tensorflow-2.20.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:02a0293d94f5c8b7125b66abf622cc4854a33ae9d618a0d41309f95e091bbaea", size = 259307122, upload-time = "2025-08-13T16:50:47.909Z" }, @@ -3218,31 +2757,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9c/d1/6aa15085d672056d5f08b5f28b1c7ce01c4e12149a23b0c98e3c79d04441/tensorflow-2.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:25265b0bc527e0d54b1e9cc60c44a24f44a809fe27666b905f0466471f9c52ec", size = 620682547, upload-time = "2025-08-13T16:52:46.396Z" }, { url = "https://files.pythonhosted.org/packages/ea/4c/c1aa90c5cc92e9f7f9c78421e121ef25bae7d378f8d1d4cbad46c6308836/tensorflow-2.20.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:47c88e05a07f1ead4977b4894b3ecd4d8075c40191065afc4fd9355c9db3d926", size = 259663776, upload-time = "2025-08-13T16:53:24.507Z" }, { url = "https://files.pythonhosted.org/packages/43/fb/8be8547c128613d82a2b006004026d86ed0bd672e913029a98153af4ffab/tensorflow-2.20.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fa3729b0126f75a99882b89fb7d536515721eda8014a63e259e780ba0a37372", size = 620815537, upload-time = "2025-08-13T16:53:42.577Z" }, - { url = "https://files.pythonhosted.org/packages/83/ff/a26d49895586207b2704403366ef976dcaa6ed07514699dae9a4fc3fa1a9/tensorflow-2.20.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28bc33759249c98eabcee9debd24e74506bbe29ac139e050cf0c74aa9888ebdf", size = 259307564, upload-time = "2025-08-13T16:54:17.691Z" }, - { url = "https://files.pythonhosted.org/packages/5f/fe/f3d738dc7c93ed5f67f9ace8dd3ed66971dab7c5a47f2d1c504ef0d0cf1d/tensorflow-2.20.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:0deb5c583dfc53b54fd158a194ce0087b406bb6518af400ca3809735e4548ec3", size = 620427169, upload-time = "2025-08-13T16:54:33.431Z" }, -] - -[[package]] -name = "tensorflow-estimator" -version = "2.7.0" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/db/de/3a71ad41b87f9dd424e3aec3b0794a60f169fa7e9a9a1e3dd44290b86dd6/tensorflow_estimator-2.7.0-py2.py3-none-any.whl", hash = "sha256:325b5a224864379242b7b76c6987ca544239be82579d33e68ec7c2bda57abc9d", size = 463110, upload-time = "2021-10-29T23:02:47.14Z" }, -] - -[[package]] -name = "tensorflow-io-gcs-filesystem" -version = "0.37.1" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e2/19/9095c69e22c879cb3896321e676c69273a549a3148c4f62aa4bc5ebdb20f/tensorflow_io_gcs_filesystem-0.37.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8febbfcc67c61e542a5ac1a98c7c20a91a5e1afc2e14b1ef0cb7c28bc3b6aa70", size = 4842078, upload-time = "2024-07-01T23:44:18.977Z" }, - { url = "https://files.pythonhosted.org/packages/f3/48/47b7d25572961a48b1de3729b7a11e835b888e41e0203cca82df95d23b91/tensorflow_io_gcs_filesystem-0.37.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9679b36e3a80921876f31685ab6f7270f3411a4cc51bc2847e80d0e4b5291e27", size = 5085736, upload-time = "2024-07-01T23:44:21.034Z" }, - { url = "https://files.pythonhosted.org/packages/de/bf/ba597d3884c77d05a78050f3c178933d69e3f80200a261df6eaa920656cd/tensorflow_io_gcs_filesystem-0.37.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6e1f2796b57e799a8ca1b75bf47c2aaa437c968408cc1a402a9862929e104cda", size = 4842079, upload-time = "2024-07-01T23:44:26.825Z" }, - { url = 
"https://files.pythonhosted.org/packages/66/7f/e36ae148c2f03d61ca1bff24bc13a0fef6d6825c966abef73fc6f880a23b/tensorflow_io_gcs_filesystem-0.37.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ee7c8ee5fe2fd8cb6392669ef16e71841133041fee8a330eff519ad9b36e4556", size = 5085736, upload-time = "2024-07-01T23:44:28.618Z" }, - { url = "https://files.pythonhosted.org/packages/d3/46/962f47af08bd39fc9feb280d3192825431a91a078c856d17a78ae4884eb1/tensorflow_io_gcs_filesystem-0.37.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fbb33f1745f218464a59cecd9a18e32ca927b0f4d77abd8f8671b645cc1a182f", size = 4842077, upload-time = "2024-07-01T23:44:33.86Z" }, - { url = "https://files.pythonhosted.org/packages/f0/9b/790d290c232bce9b691391cf16e95a96e469669c56abfb1d9d0f35fa437c/tensorflow_io_gcs_filesystem-0.37.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:286389a203a5aee1a4fa2e53718c661091aa5fea797ff4fa6715ab8436b02e6c", size = 5085733, upload-time = "2024-07-01T23:44:36.663Z" }, - { url = "https://files.pythonhosted.org/packages/66/5f/334a011caa1eb97689274d1141df8e6b7a25e389f0390bdcd90235de9783/tensorflow_io_gcs_filesystem-0.37.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:426de1173cb81fbd62becec2012fc00322a295326d90eb6c737fab636f182aed", size = 4842075, upload-time = "2024-07-01T23:44:42.094Z" }, - { url = "https://files.pythonhosted.org/packages/3d/cb/7dcee55fc5a7d7d8a862e12519322851cd5fe5b086f946fd71e4ae1ef281/tensorflow_io_gcs_filesystem-0.37.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0df00891669390078a003cedbdd3b8e645c718b111917535fa1d7725e95cdb95", size = 5087496, upload-time = "2024-07-01T23:44:43.797Z" }, ] [[package]] @@ -3301,8 +2815,7 @@ dependencies = [ { name = "filelock" }, { name = "fsspec" }, { name = "jinja2" }, - { name = "networkx", version = "3.2.1", source = { registry = "https://pypi.org/simple" }, marker = 
"python_full_version < '3.10'" }, - { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, + { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "networkx", version = "3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, @@ -3344,10 +2857,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4a/15/5e488ca0bc6162c86a33b58642bc577c84ded17c7b72d97e49b5833e2d73/torch-2.8.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:8f0a9d617a66509ded240add3754e462430a6c1fc5589f86c17b433dd808f97a", size = 887990692, upload-time = "2025-08-06T14:56:18.286Z" }, { url = "https://files.pythonhosted.org/packages/b4/a8/6a04e4b54472fc5dba7ca2341ab219e529f3c07b6941059fbf18dccac31f/torch-2.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:a7242b86f42be98ac674b88a4988643b9bc6145437ec8f048fea23f72feb5eca", size = 241603453, upload-time = "2025-08-06T14:55:22.945Z" }, { url = "https://files.pythonhosted.org/packages/04/6e/650bb7f28f771af0cb791b02348db8b7f5f64f40f6829ee82aa6ce99aabe/torch-2.8.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:7b677e17f5a3e69fdef7eb3b9da72622f8d322692930297e4ccb52fefc6c8211", size = 73632395, upload-time = "2025-08-06T14:55:28.645Z" }, - { url = "https://files.pythonhosted.org/packages/5b/b0/a321f27270049baa12f5c3fb0d6ceea005634787e3af9a8d75dce8306b0a/torch-2.8.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:da6afa31c13b669d4ba49d8a2169f0db2c3ec6bec4af898aa714f401d4c38904", size = 102059214, upload-time = "2025-08-06T14:55:33.433Z" }, - { url = 
"https://files.pythonhosted.org/packages/fd/dd/1630cb51b10d3d2e97db95e5a84c32def81fc26b005bce6fc880b0e6db81/torch-2.8.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:06fcee8000e5c62a9f3e52a688b9c5abb7c6228d0e56e3452983416025c41381", size = 888024302, upload-time = "2025-08-06T14:57:28.23Z" }, - { url = "https://files.pythonhosted.org/packages/b9/dc/1f1f621afe15e3c496e1e8f94f8903f75f87e7d642d5a985e92210cc208d/torch-2.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:5128fe752a355d9308e56af1ad28b15266fe2da5948660fad44de9e3a9e36e8c", size = 241249338, upload-time = "2025-08-06T14:57:05.669Z" }, - { url = "https://files.pythonhosted.org/packages/ae/95/ae26263aceb3d57b821179f827d0e321373ed49423e603dd5906ab14a730/torch-2.8.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:e9f071f5b52a9f6970dc8a919694b27a91ae9dc08898b2b988abbef5eddfd1ae", size = 73610795, upload-time = "2025-08-06T14:57:11.513Z" }, ] [[package]] @@ -3367,7 +2876,6 @@ name = "triton" version = "3.4.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "importlib-metadata", marker = "python_full_version < '3.10'" }, { name = "setuptools" }, ] wheels = [ @@ -3376,7 +2884,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d0/66/b1eb52839f563623d185f0927eb3530ee4d5ffe9d377cdaf5346b306689e/triton-3.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:31c1d84a5c0ec2c0f8e8a072d7fd150cab84a9c239eaddc6706c081bfae4eb04", size = 155560068, upload-time = "2025-07-30T19:58:37.081Z" }, { url = "https://files.pythonhosted.org/packages/30/7b/0a685684ed5322d2af0bddefed7906674f67974aa88b0fae6e82e3b766f6/triton-3.4.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00be2964616f4c619193cb0d1b29a99bd4b001d7dc333816073f92cf2a8ccdeb", size = 155569223, upload-time = "2025-07-30T19:58:44.017Z" }, { url = 
"https://files.pythonhosted.org/packages/20/63/8cb444ad5cdb25d999b7d647abac25af0ee37d292afc009940c05b82dda0/triton-3.4.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7936b18a3499ed62059414d7df563e6c163c5e16c3773678a3ee3d417865035d", size = 155659780, upload-time = "2025-07-30T19:58:51.171Z" }, - { url = "https://files.pythonhosted.org/packages/12/34/1251beb5a3cb93f3950ebe68732752014646003ef6eb11eb5f1a37ca78cd/triton-3.4.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:98e5c1442eaeabae2e2452ae765801bd53cd4ce873cab0d1bdd59a32ab2d9397", size = 155430799, upload-time = "2025-07-30T19:58:57.664Z" }, ] [[package]] @@ -3409,29 +2916,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" }, ] -[[package]] -name = "urllib3" -version = "1.26.20" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -sdist = { url = "https://files.pythonhosted.org/packages/e4/e8/6ff5e6bc22095cfc59b6ea711b687e2b7ed4bdb373f7eeec370a97d7392f/urllib3-1.26.20.tar.gz", hash = "sha256:40c2dc0c681e47eb8f90e7e27bf6ff7df2e677421fd46756da1161c39ca70d32", size = 307380, upload-time = "2024-08-29T15:43:11.37Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/33/cf/8435d5a7159e2a9c83a95896ed596f68cf798005fe107cc655b5c5c14704/urllib3-1.26.20-py2.py3-none-any.whl", hash = "sha256:0ed14ccfbf1c30a9072c7ca157e4319b70d65f623e91e7b32fadb2853431016e", size = 144225, upload-time = "2024-08-29T15:43:08.921Z" }, -] - [[package]] name = "urllib3" version = "2.5.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14'", - "python_full_version == '3.13.*'", - "python_full_version == 
'3.12.*'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", -] sdist = { url = "https://files.pythonhosted.org/packages/15/22/9ee70a2574a4f4599c47dd506532914ce044817c7752a79b6a51286319bc/urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760", size = 393185, upload-time = "2025-06-18T14:07:41.644Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795, upload-time = "2025-06-18T14:07:40.39Z" }, @@ -3488,10 +2976,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cc/01/9b85a99996b0a97c8a17484684f206cbb6ba73c1ce6890ac668bcf3838fb/wrapt-1.17.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:223db574bb38637e8230eb14b185565023ab624474df94d2af18f1cdb625216f", size = 113094, upload-time = "2025-08-12T05:52:22.618Z" }, { url = "https://files.pythonhosted.org/packages/25/02/78926c1efddcc7b3aa0bc3d6b33a822f7d898059f7cd9ace8c8318e559ef/wrapt-1.17.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e405adefb53a435f01efa7ccdec012c016b5a1d3f35459990afc39b6be4d5056", size = 110659, upload-time = "2025-08-12T05:52:24.057Z" }, { url = "https://files.pythonhosted.org/packages/dc/ee/c414501ad518ac3e6fe184753632fe5e5ecacdcf0effc23f31c1e4f7bfcf/wrapt-1.17.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:88547535b787a6c9ce4086917b6e1d291aa8ed914fdd3a838b3539dc95c12804", size = 106946, upload-time = "2025-08-12T05:52:45.976Z" }, - { url = "https://files.pythonhosted.org/packages/43/46/dd0791943613885f62619f18ee6107e6133237a6b6ed8a9ecfac339d0b4f/wrapt-1.17.3-cp39-cp39-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:7e18f01b0c3e4a07fe6dfdb00e29049ba17eadbc5e7609a2a3a4af83ab7d710a", size = 81745, 
upload-time = "2025-08-12T05:52:49.62Z" }, - { url = "https://files.pythonhosted.org/packages/dd/ec/bb2d19bd1a614cc4f438abac13ae26c57186197920432d2a915183b15a8b/wrapt-1.17.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0f5f51a6466667a5a356e6381d362d259125b57f059103dd9fdc8c0cf1d14139", size = 82833, upload-time = "2025-08-12T05:52:27.738Z" }, - { url = "https://files.pythonhosted.org/packages/8d/eb/66579aea6ad36f07617fedca8e282e49c7c9bab64c63b446cfe4f7f47a49/wrapt-1.17.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:59923aa12d0157f6b82d686c3fd8e1166fa8cdfb3e17b42ce3b6147ff81528df", size = 81889, upload-time = "2025-08-12T05:52:29.023Z" }, - { url = "https://files.pythonhosted.org/packages/04/9c/a56b5ac0e2473bdc3fb11b22dd69ff423154d63861cf77911cdde5e38fd2/wrapt-1.17.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:46acc57b331e0b3bcb3e1ca3b421d65637915cfcd65eb783cb2f78a511193f9b", size = 81344, upload-time = "2025-08-12T05:52:50.869Z" }, { url = "https://files.pythonhosted.org/packages/1f/f6/a933bd70f98e9cf3e08167fc5cd7aaaca49147e48411c0bd5ae701bb2194/wrapt-1.17.3-py3-none-any.whl", hash = "sha256:7171ae35d2c33d326ac19dd8facb1e82e5fd04ef8c6c0e394d7af55a55051c22", size = 23591, upload-time = "2025-08-12T05:53:20.674Z" }, ] @@ -3606,21 +3090,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9a/9a/c19c42c5b3f5a4aad748a6d5b4f23df3bed7ee5445accc65a0fb3ff03953/xxhash-3.6.0-cp314-cp314t-win32.whl", hash = "sha256:5851f033c3030dd95c086b4a36a2683c2ff4a799b23af60977188b057e467119", size = 31586, upload-time = "2025-10-02T14:36:15.603Z" }, { url = "https://files.pythonhosted.org/packages/03/d6/4cc450345be9924fd5dc8c590ceda1db5b43a0a889587b0ae81a95511360/xxhash-3.6.0-cp314-cp314t-win_amd64.whl", hash = "sha256:0444e7967dac37569052d2409b00a8860c2135cff05502df4da80267d384849f", size = 32526, upload-time = "2025-10-02T14:36:16.708Z" }, { url = 
"https://files.pythonhosted.org/packages/0f/c9/7243eb3f9eaabd1a88a5a5acadf06df2d83b100c62684b7425c6a11bcaa8/xxhash-3.6.0-cp314-cp314t-win_arm64.whl", hash = "sha256:bb79b1e63f6fd84ec778a4b1916dfe0a7c3fdb986c06addd5db3a0d413819d95", size = 28898, upload-time = "2025-10-02T14:36:17.843Z" }, - { url = "https://files.pythonhosted.org/packages/03/ff/1b4bb3f397552116c1df6266c1b83a21aeeb26061ab1f462984b499a3870/xxhash-3.6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:cc604dc06027dbeb8281aeac5899c35fcfe7c77b25212833709f0bff4ce74d2a", size = 32844, upload-time = "2025-10-02T14:36:39.157Z" }, - { url = "https://files.pythonhosted.org/packages/c1/db/27146d0bee4346a9a31f7b498a81fc02747f6f1e6c52a2e7989504278051/xxhash-3.6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:277175a73900ad43a8caeb8b99b9604f21fe8d7c842f2f9061a364a7e220ddb7", size = 30806, upload-time = "2025-10-02T14:36:40.621Z" }, - { url = "https://files.pythonhosted.org/packages/e7/2b/4896188df564908817a75de19bf7f2384b99a75af2d528f9c49326f76458/xxhash-3.6.0-cp39-cp39-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:cfbc5b91397c8c2972fdac13fb3e4ed2f7f8ccac85cd2c644887557780a9b6e2", size = 193448, upload-time = "2025-10-02T14:36:41.797Z" }, - { url = "https://files.pythonhosted.org/packages/51/c5/be8953f62e772340319a826ce1e07489935600089756cf83b628cd36ebe3/xxhash-3.6.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2762bfff264c4e73c0e507274b40634ff465e025f0eaf050897e88ec8367575d", size = 212547, upload-time = "2025-10-02T14:36:43.581Z" }, - { url = "https://files.pythonhosted.org/packages/51/1a/1e9f0b911d1cf00dd537c074ae3fae15b535a7f0d9e7edd42a9d2c4f78ce/xxhash-3.6.0-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2f171a900d59d51511209f7476933c34a0c2c711078d3c80e74e0fe4f38680ec", size = 211309, upload-time = "2025-10-02T14:36:45.307Z" }, - { url = 
"https://files.pythonhosted.org/packages/63/88/b284c6a128d88dc47f201957f926e707db79fb7415a87072e15c0e490de0/xxhash-3.6.0-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:780b90c313348f030b811efc37b0fa1431163cb8db8064cf88a7936b6ce5f222", size = 444480, upload-time = "2025-10-02T14:36:47.226Z" }, - { url = "https://files.pythonhosted.org/packages/87/e4/798293a2bf9e4fac5f6d53ce59cba4739930778dfc6c7c73f40044ab0e6e/xxhash-3.6.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:18b242455eccdfcd1fa4134c431a30737d2b4f045770f8fe84356b3469d4b919", size = 192957, upload-time = "2025-10-02T14:36:48.968Z" }, - { url = "https://files.pythonhosted.org/packages/78/55/bfd0d7db447a927897469048b953caececa3532e743b940dd1f5c1032d24/xxhash-3.6.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:a75ffc1bd5def584129774c158e108e5d768e10b75813f2b32650bb041066ed6", size = 209850, upload-time = "2025-10-02T14:36:50.258Z" }, - { url = "https://files.pythonhosted.org/packages/31/06/d08ef9a792bfebfd2fb2bcbf04a541ad283bef74749ead6f089a0809d288/xxhash-3.6.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:1fc1ed882d1e8df932a66e2999429ba6cc4d5172914c904ab193381fba825360", size = 197342, upload-time = "2025-10-02T14:36:51.651Z" }, - { url = "https://files.pythonhosted.org/packages/7b/1a/aebf90797c94e9ca407c28e23f54d71f7149d91a93406a08a09e44d06994/xxhash-3.6.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:44e342e8cc11b4e79dae5c57f2fb6360c3c20cc57d32049af8f567f5b4bcb5f4", size = 209757, upload-time = "2025-10-02T14:36:53.009Z" }, - { url = "https://files.pythonhosted.org/packages/3c/80/799eec3d0a144dc3edf8c19b4f139c27fb923c50b34352796089ca206429/xxhash-3.6.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:c2f9ccd5c4be370939a2e17602fbc49995299203da72a3429db013d44d590e86", size = 412773, upload-time = "2025-10-02T14:36:54.691Z" }, - { url = 
"https://files.pythonhosted.org/packages/6a/f9/09df7545699de09219a205123b8463ce9ea83f48acc7aeeba0269507f9d3/xxhash-3.6.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:02ea4cb627c76f48cd9fb37cf7ab22bd51e57e1b519807234b473faebe526796", size = 190357, upload-time = "2025-10-02T14:36:56.363Z" }, - { url = "https://files.pythonhosted.org/packages/07/40/2f8327f94e64a3f34d6ce3347c55207c322abbc80ae486ea45df4c62e7b3/xxhash-3.6.0-cp39-cp39-win32.whl", hash = "sha256:6551880383f0e6971dc23e512c9ccc986147ce7bfa1cd2e4b520b876c53e9f3d", size = 30585, upload-time = "2025-10-02T14:36:57.664Z" }, - { url = "https://files.pythonhosted.org/packages/6a/c8/2ecbc6799be9c02e8bf7b5a66cd94832b6ac13d59808746f0d402481c6ad/xxhash-3.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:7c35c4cdc65f2a29f34425c446f2f5cdcd0e3c34158931e1cc927ece925ab802", size = 31512, upload-time = "2025-10-02T14:36:58.837Z" }, - { url = "https://files.pythonhosted.org/packages/19/94/1d5459a9c587c94d7b8bcc710bd08bbfa145cbd814ebde41b48494362a21/xxhash-3.6.0-cp39-cp39-win_arm64.whl", hash = "sha256:ffc578717a347baf25be8397cb10d2528802d24f94cfc005c0e44fef44b5cdd6", size = 27878, upload-time = "2025-10-02T14:37:00.201Z" }, { url = "https://files.pythonhosted.org/packages/93/1e/8aec23647a34a249f62e2398c42955acd9b4c6ed5cf08cbea94dc46f78d2/xxhash-3.6.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0f7b7e2ec26c1666ad5fc9dbfa426a6a3367ceaf79db5dd76264659d509d73b0", size = 30662, upload-time = "2025-10-02T14:37:01.743Z" }, { url = "https://files.pythonhosted.org/packages/b8/0b/b14510b38ba91caf43006209db846a696ceea6a847a0c9ba0a5b1adc53d6/xxhash-3.6.0-pp311-pypy311_pp73-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:5dc1e14d14fa0f5789ec29a7062004b5933964bb9b02aae6622b8f530dc40296", size = 41056, upload-time = "2025-10-02T14:37:02.879Z" }, { url = 
"https://files.pythonhosted.org/packages/50/55/15a7b8a56590e66ccd374bbfa3f9ffc45b810886c8c3b614e3f90bd2367c/xxhash-3.6.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:881b47fc47e051b37d94d13e7455131054b56749b91b508b0907eb07900d1c13", size = 36251, upload-time = "2025-10-02T14:37:04.44Z" }, @@ -3633,9 +3102,9 @@ name = "yarl" version = "1.20.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "idna", marker = "python_full_version >= '3.10'" }, - { name = "multidict", marker = "python_full_version >= '3.10'" }, - { name = "propcache", marker = "python_full_version >= '3.10'" }, + { name = "idna" }, + { name = "multidict" }, + { name = "propcache" }, ] sdist = { url = "https://files.pythonhosted.org/packages/3c/fb/efaa23fa4e45537b827620f04cf8f3cd658b76642205162e072703a5b963/yarl-1.20.1.tar.gz", hash = "sha256:d017a4997ee50c91fd5466cef416231bb82177b93b029906cefc542ce14c35ac", size = 186428, upload-time = "2025-06-10T00:46:09.923Z" } wheels = [ @@ -3724,31 +3193,5 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9e/ed/c5fb04869b99b717985e244fd93029c7a8e8febdfcffa06093e32d7d44e7/yarl-1.20.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:88cab98aa4e13e1ade8c141daeedd300a4603b7132819c484841bb7af3edce9e", size = 341709, upload-time = "2025-06-10T00:45:23.221Z" }, { url = "https://files.pythonhosted.org/packages/24/fd/725b8e73ac2a50e78a4534ac43c6addf5c1c2d65380dd48a9169cc6739a9/yarl-1.20.1-cp313-cp313t-win32.whl", hash = "sha256:b121ff6a7cbd4abc28985b6028235491941b9fe8fe226e6fdc539c977ea1739d", size = 86591, upload-time = "2025-06-10T00:45:25.793Z" }, { url = "https://files.pythonhosted.org/packages/94/c3/b2e9f38bc3e11191981d57ea08cab2166e74ea770024a646617c9cddd9f6/yarl-1.20.1-cp313-cp313t-win_amd64.whl", hash = "sha256:541d050a355bbbc27e55d906bc91cb6fe42f96c01413dd0f4ed5a5240513874f", size = 93003, upload-time = "2025-06-10T00:45:27.752Z" }, - { url = 
"https://files.pythonhosted.org/packages/01/75/0d37402d208d025afa6b5b8eb80e466d267d3fd1927db8e317d29a94a4cb/yarl-1.20.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e42ba79e2efb6845ebab49c7bf20306c4edf74a0b20fc6b2ccdd1a219d12fad3", size = 134259, upload-time = "2025-06-10T00:45:29.882Z" }, - { url = "https://files.pythonhosted.org/packages/73/84/1fb6c85ae0cf9901046f07d0ac9eb162f7ce6d95db541130aa542ed377e6/yarl-1.20.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:41493b9b7c312ac448b7f0a42a089dffe1d6e6e981a2d76205801a023ed26a2b", size = 91269, upload-time = "2025-06-10T00:45:32.917Z" }, - { url = "https://files.pythonhosted.org/packages/f3/9c/eae746b24c4ea29a5accba9a06c197a70fa38a49c7df244e0d3951108861/yarl-1.20.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f5a5928ff5eb13408c62a968ac90d43f8322fd56d87008b8f9dabf3c0f6ee983", size = 89995, upload-time = "2025-06-10T00:45:35.066Z" }, - { url = "https://files.pythonhosted.org/packages/fb/30/693e71003ec4bc1daf2e4cf7c478c417d0985e0a8e8f00b2230d517876fc/yarl-1.20.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:30c41ad5d717b3961b2dd785593b67d386b73feca30522048d37298fee981805", size = 325253, upload-time = "2025-06-10T00:45:37.052Z" }, - { url = "https://files.pythonhosted.org/packages/0f/a2/5264dbebf90763139aeb0b0b3154763239398400f754ae19a0518b654117/yarl-1.20.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:59febc3969b0781682b469d4aca1a5cab7505a4f7b85acf6db01fa500fa3f6ba", size = 320897, upload-time = "2025-06-10T00:45:39.962Z" }, - { url = "https://files.pythonhosted.org/packages/e7/17/77c7a89b3c05856489777e922f41db79ab4faf58621886df40d812c7facd/yarl-1.20.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d2b6fb3622b7e5bf7a6e5b679a69326b4279e805ed1699d749739a61d242449e", size = 340696, upload-time = "2025-06-10T00:45:41.915Z" }, - { url = 
"https://files.pythonhosted.org/packages/6d/55/28409330b8ef5f2f681f5b478150496ec9cf3309b149dab7ec8ab5cfa3f0/yarl-1.20.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:749d73611db8d26a6281086f859ea7ec08f9c4c56cec864e52028c8b328db723", size = 335064, upload-time = "2025-06-10T00:45:43.893Z" }, - { url = "https://files.pythonhosted.org/packages/85/58/cb0257cbd4002828ff735f44d3c5b6966c4fd1fc8cc1cd3cd8a143fbc513/yarl-1.20.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9427925776096e664c39e131447aa20ec738bdd77c049c48ea5200db2237e000", size = 327256, upload-time = "2025-06-10T00:45:46.393Z" }, - { url = "https://files.pythonhosted.org/packages/53/f6/c77960370cfa46f6fb3d6a5a79a49d3abfdb9ef92556badc2dcd2748bc2a/yarl-1.20.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ff70f32aa316393eaf8222d518ce9118148eddb8a53073c2403863b41033eed5", size = 316389, upload-time = "2025-06-10T00:45:48.358Z" }, - { url = "https://files.pythonhosted.org/packages/64/ab/be0b10b8e029553c10905b6b00c64ecad3ebc8ace44b02293a62579343f6/yarl-1.20.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:c7ddf7a09f38667aea38801da8b8d6bfe81df767d9dfc8c88eb45827b195cd1c", size = 340481, upload-time = "2025-06-10T00:45:50.663Z" }, - { url = "https://files.pythonhosted.org/packages/c5/c3/3f327bd3905a4916029bf5feb7f86dcf864c7704f099715f62155fb386b2/yarl-1.20.1-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:57edc88517d7fc62b174fcfb2e939fbc486a68315d648d7e74d07fac42cec240", size = 336941, upload-time = "2025-06-10T00:45:52.554Z" }, - { url = "https://files.pythonhosted.org/packages/d1/42/040bdd5d3b3bb02b4a6ace4ed4075e02f85df964d6e6cb321795d2a6496a/yarl-1.20.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:dab096ce479d5894d62c26ff4f699ec9072269d514b4edd630a393223f45a0ee", size = 339936, upload-time = "2025-06-10T00:45:54.919Z" }, - { url = 
"https://files.pythonhosted.org/packages/0d/1c/911867b8e8c7463b84dfdc275e0d99b04b66ad5132b503f184fe76be8ea4/yarl-1.20.1-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:14a85f3bd2d7bb255be7183e5d7d6e70add151a98edf56a770d6140f5d5f4010", size = 360163, upload-time = "2025-06-10T00:45:56.87Z" }, - { url = "https://files.pythonhosted.org/packages/e2/31/8c389f6c6ca0379b57b2da87f1f126c834777b4931c5ee8427dd65d0ff6b/yarl-1.20.1-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:2c89b5c792685dd9cd3fa9761c1b9f46fc240c2a3265483acc1565769996a3f8", size = 359108, upload-time = "2025-06-10T00:45:58.869Z" }, - { url = "https://files.pythonhosted.org/packages/7f/09/ae4a649fb3964324c70a3e2b61f45e566d9ffc0affd2b974cbf628957673/yarl-1.20.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:69e9b141de5511021942a6866990aea6d111c9042235de90e08f94cf972ca03d", size = 351875, upload-time = "2025-06-10T00:46:01.45Z" }, - { url = "https://files.pythonhosted.org/packages/8d/43/bbb4ed4c34d5bb62b48bf957f68cd43f736f79059d4f85225ab1ef80f4b9/yarl-1.20.1-cp39-cp39-win32.whl", hash = "sha256:b5f307337819cdfdbb40193cad84978a029f847b0a357fbe49f712063cfc4f06", size = 82293, upload-time = "2025-06-10T00:46:03.763Z" }, - { url = "https://files.pythonhosted.org/packages/d7/cd/ce185848a7dba68ea69e932674b5c1a42a1852123584bccc5443120f857c/yarl-1.20.1-cp39-cp39-win_amd64.whl", hash = "sha256:eae7bfe2069f9c1c5b05fc7fe5d612e5bbc089a39309904ee8b829e322dcad00", size = 87385, upload-time = "2025-06-10T00:46:05.655Z" }, { url = "https://files.pythonhosted.org/packages/b4/2d/2345fce04cfd4bee161bf1e7d9cdc702e3e16109021035dbb24db654a622/yarl-1.20.1-py3-none-any.whl", hash = "sha256:83b8eb083fe4683c6115795d9fc1cfaf2cbbefb19b3a1cb68f6527460f483a77", size = 46542, upload-time = "2025-06-10T00:46:07.521Z" }, ] - -[[package]] -name = "zipp" -version = "3.23.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166", size = 25547, upload-time = "2025-06-08T17:06:39.4Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e", size = 10276, upload-time = "2025-06-08T17:06:38.034Z" }, -] From 6ba89d51ae49cdfc0e8e99c7c441470ea87223ca Mon Sep 17 00:00:00 2001 From: Brendan Clement Date: Fri, 19 Jun 2026 18:41:58 -0400 Subject: [PATCH 155/177] feat: support hamming clustering (#7379) Add SIMD-accelerated pairwise hamming distance over 64-bit binary hashes, plus union-find clustering to group binary vectors within a hamming-distance threshold (near-duplicate detection). - lance-linalg: pairwise_hamming_distance[_parallel] with AVX-512 / AVX2 / scalar kernels, PairwiseResult, UnionFind, Cluster/ClusteringResult, extract_hashes_from_fixed_list, cluster_edges/cluster_pairwise_result. - lance: hamming_clustering_for_ivf_partition / for_sample / for_range / from_hashes and get_ivf_partition_info, returning a RecordBatchReader of (representative, duplicates) clusters. - python: thin bindings + wrappers in lance.vector, type stubs, and a test. Recreates #6265 (originally authored by Jack Ye) on top of current main, updating imports/signatures for upstream API drift. 
Co-authored-by: Jack Ye --- Cargo.lock | 1 + python/Cargo.lock | 1 + python/python/lance/lance/__init__.pyi | 21 + python/python/lance/vector.py | 147 +++ python/python/tests/test_vector.py | 37 +- python/src/dataset.rs | 182 +++ rust/lance-linalg/Cargo.toml | 5 +- rust/lance-linalg/benches/hamming.rs | 52 - rust/lance-linalg/src/distance.rs | 6 +- rust/lance-linalg/src/distance/hamming.rs | 1323 ++++++++++++++++++++- rust/lance/Cargo.toml | 4 + rust/lance/benches/hamming.rs | 228 ++++ rust/lance/src/index/vector.rs | 1 + rust/lance/src/index/vector/hamming.rs | 938 +++++++++++++++ 14 files changed, 2885 insertions(+), 61 deletions(-) delete mode 100644 rust/lance-linalg/benches/hamming.rs create mode 100644 rust/lance/benches/hamming.rs create mode 100644 rust/lance/src/index/vector/hamming.rs diff --git a/Cargo.lock b/Cargo.lock index 89d20bdf647..11a5fb65a7b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4905,6 +4905,7 @@ dependencies = [ "num-traits", "proptest", "rand 0.9.4", + "rayon", ] [[package]] diff --git a/python/Cargo.lock b/python/Cargo.lock index 01d2edda1c8..126714795cc 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -4488,6 +4488,7 @@ dependencies = [ "lance-core", "num-traits", "rand 0.9.4", + "rayon", ] [[package]] diff --git a/python/python/lance/lance/__init__.pyi b/python/python/lance/lance/__init__.pyi index 74db076db41..26ad75a27b7 100644 --- a/python/python/lance/lance/__init__.pyi +++ b/python/python/lance/lance/__init__.pyi @@ -463,6 +463,27 @@ class _Dataset: def get_transactions( self, recent_transactions=10 ) -> List[Optional[Transaction]]: ... + def hamming_clustering_for_ivf_partition( + self, + index_name: str, + partition_id: int, + hamming_threshold: int, + ) -> pa.RecordBatchReader: ... + def get_ivf_partition_info(self, index_name: str) -> List[dict]: ... + def hamming_clustering_for_sample( + self, + column: str, + sample_size: Optional[int], + hamming_threshold: int, + ) -> pa.RecordBatchReader: ... 
+ def hamming_clustering_for_range( + self, + column: str, + fragment_id: int, + start_row: int, + num_rows: int, + hamming_threshold: int, + ) -> pa.RecordBatchReader: ... class _MergeInsertBuilder: def __init__(self, dataset: _Dataset, on: str | Iterable[str]): ... diff --git a/python/python/lance/vector.py b/python/python/lance/vector.py index 34a6154a321..5ce5e8b61e5 100644 --- a/python/python/lance/vector.py +++ b/python/python/lance/vector.py @@ -749,3 +749,150 @@ def _partition_and_pq_codes_assignment() -> Iterable[pa.RecordBatch]: data_file.path for frag in ds.get_fragments() for data_file in frag.data_files() ] return dst_dataset_uri, shuffle_buffers + + +# ============================================================================= +# Hamming Distance Clustering +# ============================================================================= + + +def hamming_clustering_for_ivf_partition( + dataset: "LanceDataset", + index_name: str, + partition_id: int, + hamming_threshold: int, +) -> pa.RecordBatchReader: + """ + Perform hamming clustering on a partition of an IVF_FLAT index. + + Loads a partition from an IVF_FLAT index on a hash column, computes + pairwise hamming distances between all hashes in the partition, + filters by threshold, and clusters the results using union-find. + + Parameters + ---------- + dataset : LanceDataset + The Lance dataset containing the hash column with an IVF_FLAT index. 
+ index_name : str + Name of the IVF_FLAT index on the hash column + partition_id : int + The partition ID within the IVF_FLAT index + hamming_threshold : int + Maximum hamming distance to consider as similar + + Returns + ------- + pa.RecordBatchReader + A reader yielding batches with columns: + + - 'representative': uint64 - The representative row ID for each cluster + - 'duplicates': list - List of duplicate row IDs in each cluster + """ + return dataset._ds.hamming_clustering_for_ivf_partition( + index_name, partition_id, hamming_threshold + ) + + +def get_ivf_partition_info( + dataset: "LanceDataset", + index_name: str, +) -> List[dict]: + """ + Get partition information for an IVF_FLAT index. + + Parameters + ---------- + dataset : LanceDataset + The Lance dataset containing the hash column with an IVF_FLAT index. + index_name : str + Name of the IVF_FLAT index + + Returns + ------- + list[dict] + List of partition info dicts with 'partition_id' and 'size' + """ + return dataset._ds.get_ivf_partition_info(index_name) + + +def hamming_clustering_for_sample( + dataset: "LanceDataset", + column: str, + sample_size: Optional[int] = None, + hamming_threshold: int = 10, +) -> pa.RecordBatchReader: + """ + Perform pairwise hamming distance clustering on a sample of the dataset. + + Randomly samples rows from the dataset, computes pairwise hamming distances + between all hashes in the sample, filters by threshold, and clusters the + results using union-find. + + Parameters + ---------- + dataset : LanceDataset + The Lance dataset containing the hash column. + column : str + Name of the hash column (must be FixedSizeList) + sample_size : int, optional + Number of rows to sample. If None, uses all rows. 
+ hamming_threshold : int, default 10 + Maximum hamming distance to consider as similar + + Returns + ------- + pa.RecordBatchReader + A reader yielding batches with columns: + + - 'representative': uint64 - The representative row ID for each cluster + - 'duplicates': list - List of duplicate row IDs in each cluster + """ + return dataset._ds.hamming_clustering_for_sample( + column, sample_size, hamming_threshold + ) + + +def hamming_clustering_for_range( + dataset: "LanceDataset", + column: str, + fragment_id: int, + start_row: int, + num_rows: int, + hamming_threshold: int = 10, +) -> pa.RecordBatchReader: + """ + Perform pairwise hamming distance clustering on a contiguous range of rows. + + Reads a contiguous range of rows from a specific fragment, computes pairwise + hamming distances between all hashes in the range, filters by threshold, + and clusters the results using union-find. + + Unlike sampling, this reads sequential rows which is useful for distributed + processing where each worker handles a specific range of a fragment. + + Parameters + ---------- + dataset : LanceDataset + The Lance dataset containing the hash column. 
+ column : str + Name of the hash column (must be FixedSizeList) + fragment_id : int + The fragment ID to read from + start_row : int + The starting row offset within the fragment + num_rows : int + Number of rows to read from the start position + hamming_threshold : int, default 10 + Maximum hamming distance to consider as similar + + Returns + ------- + pa.RecordBatchReader + A reader yielding batches with columns: + + - 'representative': uint64 - The representative row ID for each cluster + - 'duplicates': list - List of duplicate row IDs in each cluster + """ + return dataset._ds.hamming_clustering_for_range( + column, fragment_id, start_row, num_rows, hamming_threshold + ) diff --git a/python/python/tests/test_vector.py b/python/python/tests/test_vector.py index c02c8312f88..4ea4e7d425e 100644 --- a/python/python/tests/test_vector.py +++ b/python/python/tests/test_vector.py @@ -5,7 +5,7 @@ import numpy as np import pyarrow as pa import pytest -from lance.vector import vec_to_table +from lance.vector import hamming_clustering_for_sample, vec_to_table def test_dict(): @@ -147,3 +147,38 @@ def test_binary_vectors_invalid_metric(tmp_path): "metric": "l2", } ).to_table() + + +def _hash_table(hashes): + """Build a table with a ``hash`` column of FixedSizeList. + + ``hashes`` is a list of 8-byte sequences, one per row. + """ + flat = [byte for row in hashes for byte in row] + values = pa.FixedSizeListArray.from_arrays( + pa.array(flat, type=pa.uint8()), list_size=8 + ) + return pa.Table.from_arrays([values], names=["hash"]) + + +def test_hamming_clustering_for_sample(tmp_path): + hash_a = [0, 0, 0, 0, 0, 0, 0, 0] + hash_b = [255, 0, 0, 0, 0, 0, 0, 0] # 8 bits from hash_a + hash_c = [1, 2, 3, 4, 5, 6, 7, 8] # far from both + # Rows 0,1,2 share hash_a; rows 3,4 share hash_b; row 5 is unique. 
+ table = _hash_table([hash_a, hash_a, hash_a, hash_b, hash_b, hash_c]) + dataset = lance.write_dataset(table, tmp_path / "hashes") + + # threshold 0 => only exact-match hashes cluster together. Full scan + # (sample_size=None) yields deterministic row ids 0..5. + result = hamming_clustering_for_sample(dataset, "hash", None, 0).read_all() + + clusters = { + rep: sorted(dups) + for rep, dups in zip( + result["representative"].to_pylist(), + result["duplicates"].to_pylist(), + ) + } + # Singleton row 5 is not emitted as a cluster. + assert clusters == {0: [1, 2], 3: [4]} diff --git a/python/src/dataset.rs b/python/src/dataset.rs index 8bfa81aeae4..31eaa96a654 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -3428,6 +3428,188 @@ impl Dataset { self.ds.clone(), )) } + + /// Perform pairwise hamming distance clustering on a partition of an IVF_FLAT index. + /// + /// This function loads a specific partition from an IVF_FLAT index on a hash column, + /// computes pairwise hamming distances between all hashes in the partition, + /// filters by threshold, and clusters the results using union-find. 
+ /// + /// Parameters + /// ---------- + /// index_name : str + /// Name of the IVF_FLAT index on the hash column + /// partition_id : int + /// The partition ID within the IVF_FLAT index + /// hamming_threshold : int + /// Maximum hamming distance to consider as similar + /// + /// Returns + /// ------- + /// pyarrow.RecordBatchReader + /// A reader yielding batches with columns: + /// - 'representative': uint64 - The representative row ID for each cluster + /// - 'duplicates': list - List of duplicate row IDs in each cluster + #[pyo3(signature = (index_name, partition_id, hamming_threshold))] + fn hamming_clustering_for_ivf_partition( + &self, + py: Python<'_>, + index_name: &str, + partition_id: usize, + hamming_threshold: u32, + ) -> PyResult>> { + use lance::index::vector::hamming::hamming_clustering_for_ivf_partition; + + let ds = self.ds.as_ref(); + let reader = rt() + .block_on( + Some(py), + hamming_clustering_for_ivf_partition( + ds, + index_name, + partition_id, + hamming_threshold, + ), + )? + .map_err(|err| PyValueError::new_err(err.to_string()))?; + + Ok(PyArrowType(reader)) + } + + /// Get partition information for an IVF_FLAT index. + /// + /// Parameters + /// ---------- + /// index_name : str + /// Name of the IVF_FLAT index + /// + /// Returns + /// ------- + /// List[dict] + /// List of partition info dicts with 'partition_id' and 'size' + #[pyo3(signature = (index_name))] + fn get_ivf_partition_info( + &self, + py: Python<'_>, + index_name: &str, + ) -> PyResult>> { + use lance::index::vector::hamming::get_ivf_partition_info; + + let ds = self.ds.as_ref(); + let result = rt() + .block_on(Some(py), get_ivf_partition_info(ds, index_name))? 
+ .map_err(|err| PyValueError::new_err(err.to_string()))?; + + let partitions: PyResult> = result + .iter() + .map(|p| { + let dict = PyDict::new(py); + dict.set_item("partition_id", p.partition_id)?; + dict.set_item("size", p.size)?; + Ok(dict.into()) + }) + .collect(); + + partitions + } + + /// Perform pairwise hamming distance clustering on sampled rows from a dataset. + /// + /// This function samples N rows randomly from the dataset, extracts hashes, + /// computes pairwise hamming distances, and clusters the results. + /// It's useful for benchmarking and testing without requiring an IVF index. + /// + /// Parameters + /// ---------- + /// column : str + /// Name of the hash column (must be FixedSizeList) + /// sample_size : int, optional + /// Number of rows to sample (if None or >= total rows, uses all rows) + /// hamming_threshold : int + /// Maximum hamming distance to consider as similar + /// + /// Returns + /// ------- + /// pyarrow.RecordBatchReader + /// A reader yielding batches with columns: + /// - 'representative': uint64 - The representative row ID for each cluster + /// - 'duplicates': list - List of duplicate row IDs in each cluster + #[pyo3(signature = (column, sample_size, hamming_threshold))] + fn hamming_clustering_for_sample( + &self, + py: Python<'_>, + column: &str, + sample_size: Option, + hamming_threshold: u32, + ) -> PyResult>> { + use lance::index::vector::hamming::hamming_clustering_for_sample; + + let ds = self.ds.as_ref(); + let reader = rt() + .block_on( + Some(py), + hamming_clustering_for_sample(ds, column, sample_size, hamming_threshold), + )? + .map_err(|err| PyValueError::new_err(err.to_string()))?; + + Ok(PyArrowType(reader)) + } + + /// Perform pairwise hamming distance clustering on a contiguous range of rows from a fragment. + /// + /// This function reads a contiguous range of rows from a specific fragment, + /// extracts hashes, computes pairwise hamming distances, and clusters the results. 
+ /// Unlike sampling, this reads sequential rows which is useful for distributed + /// processing where each worker handles a specific range of a fragment. + /// + /// Parameters + /// ---------- + /// column : str + /// Name of the hash column (must be FixedSizeList) + /// fragment_id : int + /// The fragment ID to read from + /// start_row : int + /// The starting row offset within the fragment + /// num_rows : int + /// Number of rows to read from the start position + /// hamming_threshold : int + /// Maximum hamming distance to consider as similar + /// + /// Returns + /// ------- + /// pyarrow.RecordBatchReader + /// A reader yielding batches with columns: + /// - 'representative': uint64 - The representative row ID for each cluster + /// - 'duplicates': list - List of duplicate row IDs in each cluster + #[pyo3(signature = (column, fragment_id, start_row, num_rows, hamming_threshold))] + fn hamming_clustering_for_range( + &self, + py: Python<'_>, + column: &str, + fragment_id: usize, + start_row: usize, + num_rows: usize, + hamming_threshold: u32, + ) -> PyResult>> { + use lance::index::vector::hamming::hamming_clustering_for_range; + + let ds = self.ds.as_ref(); + let reader = rt() + .block_on( + Some(py), + hamming_clustering_for_range( + ds, + column, + fragment_id, + start_row, + num_rows, + hamming_threshold, + ), + )? 
+ .map_err(|err| PyValueError::new_err(err.to_string()))?; + + Ok(PyArrowType(reader)) + } } #[pyclass(name = "SqlQuery", module = "_lib", subclass, skip_from_py_object)] diff --git a/rust/lance-linalg/Cargo.toml b/rust/lance-linalg/Cargo.toml index cf91deb69d7..6a188ec3c62 100644 --- a/rust/lance-linalg/Cargo.toml +++ b/rust/lance-linalg/Cargo.toml @@ -18,6 +18,7 @@ lance-arrow = { workspace = true } lance-core = { workspace = true } num-traits = { workspace = true } rand = { workspace = true } +rayon = { workspace = true } [dev-dependencies] approx = { workspace = true } @@ -50,10 +51,6 @@ harness = false name = "cosine" harness = false -[[bench]] -name = "hamming" -harness = false - [[bench]] name = "norm_l2" harness = false diff --git a/rust/lance-linalg/benches/hamming.rs b/rust/lance-linalg/benches/hamming.rs deleted file mode 100644 index 9af3bf4614b..00000000000 --- a/rust/lance-linalg/benches/hamming.rs +++ /dev/null @@ -1,52 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright The Lance Authors - -use std::iter::repeat_with; - -use std::hint::black_box; - -use criterion::{Criterion, criterion_group, criterion_main}; -use lance_linalg::distance::hamming::{hamming, hamming_scalar}; -use rand::Rng; - -const DIMENSION: usize = 1024; -const TOTAL: usize = 1024 * 1024; // 1M vectors - -fn bench_hamming(c: &mut Criterion) { - let mut rng = rand::rng(); - - let key = repeat_with(|| rng.random::()) - .take(DIMENSION) - .collect::>(); - let target = repeat_with(|| rng.random::()) - .take(TOTAL * DIMENSION) - .collect::>(); - - c.bench_function("hamming,scalar", |b| { - b.iter(|| { - black_box( - target - .chunks_exact(DIMENSION) - .map(|tgt| hamming_scalar(&key, tgt)) - .sum::(), - ); - }) - }); - - c.bench_function("hamming,auto_vec", |b| { - b.iter(|| { - black_box( - target - .chunks_exact(DIMENSION) - .map(|tgt| hamming(&key, tgt)) - .sum::(), - ); - }) - }); -} - -criterion_group!( - name=benches; - config = 
Criterion::default().significance_level(0.1).sample_size(10); - targets = bench_hamming); -criterion_main!(benches); diff --git a/rust/lance-linalg/src/distance.rs b/rust/lance-linalg/src/distance.rs index a356d5c1225..23d1cae2d63 100644 --- a/rust/lance-linalg/src/distance.rs +++ b/rust/lance-linalg/src/distance.rs @@ -27,7 +27,11 @@ pub mod norm_l2; pub use cosine::*; pub use dot::*; -use hamming::hamming_distance_arrow_batch; +pub use hamming::{ + Cluster, ClusteringResult, PairwiseResult, UnionFind, cluster_edges, cluster_pairwise_result, + extract_hashes_from_fixed_list, hamming_distance_arrow_batch, hamming_u64, + pairwise_hamming_distance, pairwise_hamming_distance_parallel, +}; pub use l2::*; use lance_core::deepsize::DeepSizeOf; pub use norm_l2::*; diff --git a/rust/lance-linalg/src/distance/hamming.rs b/rust/lance-linalg/src/distance/hamming.rs index d8fd60f4054..a6f4b038195 100644 --- a/rust/lance-linalg/src/distance/hamming.rs +++ b/rust/lance-linalg/src/distance/hamming.rs @@ -2,14 +2,24 @@ // SPDX-FileCopyrightText: Copyright The Lance Authors //! Hamming distance. +//! +//! This module provides hamming distance computation for binary vectors, +//! including SIMD-accelerated pairwise hamming distance for 64-bit hashes. +use std::collections::HashMap; use std::sync::Arc; -use crate::{Error, Result}; +use arrow_array::builder::{ListBuilder, UInt64Builder}; use arrow_array::cast::AsArray; use arrow_array::types::UInt8Type; -use arrow_array::{Array, FixedSizeListArray, Float32Array}; -use arrow_schema::DataType; +use arrow_array::{ + Array, ArrayRef, FixedSizeListArray, Float32Array, RecordBatch, RecordBatchIterator, + RecordBatchReader, UInt32Array, UInt64Array, +}; +use arrow_schema::{DataType, Field, Schema, SchemaRef}; +use rayon::prelude::*; + +use crate::{Error, Result}; pub trait Hamming { /// Hamming distance between two vectors. 
@@ -86,6 +96,640 @@ pub fn hamming_distance_arrow_batch( ))) } +/// Compute hamming distance between two 64-bit values using POPCNT. +#[inline(always)] +pub fn hamming_u64(a: u64, b: u64) -> u32 { + (a ^ b).count_ones() +} + +/// Result of pairwise hamming distance computation. +#[derive(Debug, Clone)] +pub struct PairwiseResult { + pub row_id_a: Vec, + pub row_id_b: Vec, + pub distances: Vec, +} + +impl PairwiseResult { + pub fn new() -> Self { + Self { + row_id_a: Vec::new(), + row_id_b: Vec::new(), + distances: Vec::new(), + } + } + + pub fn with_capacity(capacity: usize) -> Self { + Self { + row_id_a: Vec::with_capacity(capacity), + row_id_b: Vec::with_capacity(capacity), + distances: Vec::with_capacity(capacity), + } + } + + pub fn push(&mut self, a: u64, b: u64, dist: u32) { + self.row_id_a.push(a); + self.row_id_b.push(b); + self.distances.push(dist); + } + + pub fn len(&self) -> usize { + self.row_id_a.len() + } + + pub fn is_empty(&self) -> bool { + self.row_id_a.is_empty() + } + + pub fn extend(&mut self, other: Self) { + self.row_id_a.extend(other.row_id_a); + self.row_id_b.extend(other.row_id_b); + self.distances.extend(other.distances); + } + + /// Convert to Arrow RecordBatch, consuming self. + pub fn into_record_batch(self) -> RecordBatch { + let schema = Arc::new(Schema::new(vec![ + Field::new("row_id_a", DataType::UInt64, false), + Field::new("row_id_b", DataType::UInt64, false), + Field::new("distance", DataType::UInt32, false), + ])); + + let row_id_a = Arc::new(UInt64Array::from(self.row_id_a)); + let row_id_b = Arc::new(UInt64Array::from(self.row_id_b)); + let distances = Arc::new(UInt32Array::from(self.distances)); + + RecordBatch::try_new(schema, vec![row_id_a, row_id_b, distances]) + .expect("Failed to create RecordBatch") + } +} + +impl Default for PairwiseResult { + fn default() -> Self { + Self::new() + } +} + +/// Compute hamming distances for a query against multiple targets. +/// Uses SIMD acceleration when available. 
+#[inline] +pub fn hamming_batch_u64(query: u64, targets: &[u64], results: &mut [u32]) { + debug_assert_eq!(targets.len(), results.len()); + hamming_batch_simd(query, targets, results); +} + +/// SIMD-accelerated batch hamming distance computation. +#[inline] +fn hamming_batch_simd(query: u64, targets: &[u64], results: &mut [u32]) { + #[cfg(target_arch = "x86_64")] + { + if is_x86_feature_detected!("avx512vpopcntdq") && is_x86_feature_detected!("avx512f") { + unsafe { + hamming_batch_avx512(query, targets, results); + } + return; + } + if is_x86_feature_detected!("avx2") { + unsafe { + hamming_batch_avx2(query, targets, results); + } + return; + } + } + + // Scalar fallback (LLVM auto-vectorizes well on Apple Silicon) + hamming_batch_scalar(query, targets, results); +} + +/// Scalar fallback using count_ones() which compiles to POPCNT. +#[inline] +fn hamming_batch_scalar(query: u64, targets: &[u64], results: &mut [u32]) { + // Unroll for better auto-vectorization + let n = targets.len(); + let chunks = n / 8; + let mut i = 0; + + for _ in 0..chunks { + results[i] = (query ^ targets[i]).count_ones(); + results[i + 1] = (query ^ targets[i + 1]).count_ones(); + results[i + 2] = (query ^ targets[i + 2]).count_ones(); + results[i + 3] = (query ^ targets[i + 3]).count_ones(); + results[i + 4] = (query ^ targets[i + 4]).count_ones(); + results[i + 5] = (query ^ targets[i + 5]).count_ones(); + results[i + 6] = (query ^ targets[i + 6]).count_ones(); + results[i + 7] = (query ^ targets[i + 7]).count_ones(); + i += 8; + } + + // Handle remainder + while i < n { + results[i] = (query ^ targets[i]).count_ones(); + i += 1; + } +} + +/// AVX-512 VPOPCNTDQ: Process 8 x 64-bit values at once. 
+#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx512f", enable = "avx512vpopcntdq")] +unsafe fn hamming_batch_avx512(query: u64, targets: &[u64], results: &mut [u32]) { + use std::arch::x86_64::*; + + let n = targets.len(); + let query_vec = _mm512_set1_epi64(query as i64); + + let chunks = n / 8; + let remainder = n % 8; + + for i in 0..chunks { + let offset = i * 8; + let targets_ptr = targets.as_ptr().add(offset) as *const __m512i; + let target_vec = _mm512_loadu_si512(targets_ptr); + + let xor_result = _mm512_xor_si512(query_vec, target_vec); + let popcount = _mm512_popcnt_epi64(xor_result); + let popcount_32 = _mm512_cvtepi64_epi32(popcount); + + _mm256_storeu_si256( + results.as_mut_ptr().add(offset) as *mut __m256i, + popcount_32, + ); + } + + if remainder > 0 { + let offset = chunks * 8; + for j in 0..remainder { + results[offset + j] = (query ^ targets[offset + j]).count_ones(); + } + } +} + +/// AVX2 popcount using lookup table (Harley-Seal / PSHUFB method). +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx2")] +unsafe fn hamming_batch_avx2(query: u64, targets: &[u64], results: &mut [u32]) { + use std::arch::x86_64::*; + + let n = targets.len(); + + let lookup = _mm256_setr_epi8( + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, + 3, 4, + ); + let low_mask = _mm256_set1_epi8(0x0f); + let query_vec = _mm256_set1_epi64x(query as i64); + + let chunks = n / 4; + let remainder = n % 4; + + for i in 0..chunks { + let offset = i * 4; + let targets_ptr = targets.as_ptr().add(offset) as *const __m256i; + let target_vec = _mm256_loadu_si256(targets_ptr); + + let xor_result = _mm256_xor_si256(query_vec, target_vec); + + // Popcount using nibble lookup + let lo = _mm256_and_si256(xor_result, low_mask); + let hi = _mm256_and_si256(_mm256_srli_epi16(xor_result, 4), low_mask); + let popcnt_lo = _mm256_shuffle_epi8(lookup, lo); + let popcnt_hi = _mm256_shuffle_epi8(lookup, hi); + let popcnt_bytes = 
_mm256_add_epi8(popcnt_lo, popcnt_hi); + let popcount = _mm256_sad_epu8(popcnt_bytes, _mm256_setzero_si256()); + + let results_ptr = results.as_mut_ptr().add(offset); + *results_ptr = _mm256_extract_epi32::<0>(popcount) as u32; + *results_ptr.add(1) = _mm256_extract_epi32::<2>(popcount) as u32; + *results_ptr.add(2) = _mm256_extract_epi32::<4>(popcount) as u32; + *results_ptr.add(3) = _mm256_extract_epi32::<6>(popcount) as u32; + } + + if remainder > 0 { + let offset = chunks * 4; + for j in 0..remainder { + results[offset + j] = (query ^ targets[offset + j]).count_ones(); + } + } +} + +/// Compute pairwise hamming distances for all pairs of hashes. +/// +/// Returns pairs where distance <= threshold (if provided). +/// +/// # Arguments +/// * `hashes` - Vector of 64-bit hash values +/// * `row_ids` - Optional row IDs (defaults to indices if None) +/// * `threshold` - Optional maximum distance to include in results +pub fn pairwise_hamming_distance( + hashes: &[u64], + row_ids: Option<&[u64]>, + threshold: Option, +) -> PairwiseResult { + let n = hashes.len(); + if n < 2 { + return PairwiseResult::new(); + } + + let threshold = threshold.unwrap_or(u32::MAX); + let num_pairs = n * (n - 1) / 2; + let mut result = PairwiseResult::with_capacity(num_pairs.min(1_000_000)); + + for i in 0..n { + for j in (i + 1)..n { + let dist = hamming_u64(hashes[i], hashes[j]); + if dist <= threshold { + let id_a = row_ids.map_or(i as u64, |ids| ids[i]); + let id_b = row_ids.map_or(j as u64, |ids| ids[j]); + result.push(id_a, id_b, dist); + } + } + } + + result +} + +/// Compute pairwise hamming distances in parallel using rayon + SIMD. +/// +/// Uses chunked parallelization for balanced workload distribution. 
+pub fn pairwise_hamming_distance_parallel( + hashes: &[u64], + row_ids: Option<&[u64]>, + threshold: Option, +) -> PairwiseResult { + let n = hashes.len(); + if n < 2 { + return PairwiseResult::new(); + } + + let threshold = threshold.unwrap_or(u32::MAX); + let total_pairs = n * (n - 1) / 2; + + // For small datasets, use sequential to avoid thread overhead + if total_pairs < 10_000 { + return pairwise_hamming_distance(hashes, row_ids, Some(threshold)); + } + + let threads = rayon::current_num_threads(); + let pairs_per_chunk = total_pairs.div_ceil(threads); + let chunks = compute_balanced_chunks(n, pairs_per_chunk); + + let results: Vec = chunks + .into_par_iter() + .map(|(start_row, end_row)| { + process_row_range(hashes, row_ids, threshold, start_row, end_row) + }) + .collect(); + + let mut combined = PairwiseResult::new(); + for r in results { + combined.extend(r); + } + combined +} + +/// Compute balanced chunks for parallel processing. +fn compute_balanced_chunks(n: usize, target_pairs_per_chunk: usize) -> Vec<(usize, usize)> { + let mut chunks = Vec::new(); + let mut current_start = 0; + let mut current_pairs = 0; + + for i in 0..n { + let pairs_for_row = n - i - 1; + current_pairs += pairs_for_row; + + if current_pairs >= target_pairs_per_chunk || i == n - 1 { + chunks.push((current_start, i + 1)); + current_start = i + 1; + current_pairs = 0; + } + } + + chunks +} + +/// Process a range of rows for pairwise comparison using SIMD. 
+fn process_row_range( + hashes: &[u64], + row_ids: Option<&[u64]>, + threshold: u32, + start_row: usize, + end_row: usize, +) -> PairwiseResult { + let n = hashes.len(); + let mut result = PairwiseResult::new(); + + for i in start_row..end_row { + let remaining = n - i - 1; + if remaining == 0 { + continue; + } + + let mut distances = vec![0u32; remaining]; + hamming_batch_u64(hashes[i], &hashes[i + 1..], &mut distances); + + let id_a = row_ids.map_or(i as u64, |ids| ids[i]); + for (j_offset, &dist) in distances.iter().enumerate() { + if dist <= threshold { + let j = i + 1 + j_offset; + let id_b = row_ids.map_or(j as u64, |ids| ids[j]); + result.push(id_a, id_b, dist); + } + } + } + + result +} + +/// Extract u64 hashes from a FixedSizeList Arrow array. +pub fn extract_hashes_from_fixed_list(array: &FixedSizeListArray) -> Result> { + let list_size = array.value_length(); + if list_size != 8 { + return Err(Error::InvalidArgumentError(format!( + "Expected FixedSizeList with size 8, got size {}", + list_size + ))); + } + + let values = array + .values() + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::InvalidArgumentError("Expected UInt8Array values in FixedSizeList".to_string()) + })?; + + let n = array.len(); + let mut hashes = Vec::with_capacity(n); + + for i in 0..n { + let start = i * 8; + let bytes = &values.values()[start..start + 8]; + let mut arr = [0u8; 8]; + arr.copy_from_slice(bytes); + hashes.push(u64::from_le_bytes(arr)); + } + + Ok(hashes) +} + +/// Union-Find data structure with path compression for clustering. +pub struct UnionFind { + parent: HashMap, + rank: HashMap, +} + +impl UnionFind { + pub fn new() -> Self { + Self { + parent: HashMap::new(), + rank: HashMap::new(), + } + } + + pub fn with_capacity(capacity: usize) -> Self { + Self { + parent: HashMap::with_capacity(capacity), + rank: HashMap::with_capacity(capacity), + } + } + + /// Find the root of a node with path compression. 
+ pub fn find(&mut self, x: u64) -> u64 { + if let std::collections::hash_map::Entry::Vacant(e) = self.parent.entry(x) { + e.insert(x); + self.rank.insert(x, 0); + return x; + } + + let mut current = x; + let mut path = Vec::new(); + + while self.parent[¤t] != current { + path.push(current); + current = self.parent[¤t]; + } + let root = current; + + for node in path { + self.parent.insert(node, root); + } + + root + } + + /// Union two nodes, using union by rank. + pub fn union(&mut self, a: u64, b: u64) -> bool { + let root_a = self.find(a); + let root_b = self.find(b); + + if root_a == root_b { + return false; + } + + let rank_a = self.rank[&root_a]; + let rank_b = self.rank[&root_b]; + + if rank_a < rank_b { + self.parent.insert(root_a, root_b); + } else if rank_a > rank_b { + self.parent.insert(root_b, root_a); + } else if root_a < root_b { + self.parent.insert(root_b, root_a); + *self.rank.get_mut(&root_a).unwrap() += 1; + } else { + self.parent.insert(root_a, root_b); + *self.rank.get_mut(&root_b).unwrap() += 1; + } + + true + } + + pub fn nodes(&self) -> impl Iterator { + self.parent.keys() + } + + pub fn len(&self) -> usize { + self.parent.len() + } + + pub fn is_empty(&self) -> bool { + self.parent.is_empty() + } +} + +impl Default for UnionFind { + fn default() -> Self { + Self::new() + } +} + +/// A cluster with representative and duplicates. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Cluster { + /// The representative row ID (smallest in the cluster). + pub representative: u64, + /// List of duplicate row IDs (excludes the representative). + pub duplicates: Vec, +} + +impl Cluster { + pub fn size(&self) -> usize { + 1 + self.duplicates.len() + } +} + +/// Result of the clustering operation. +#[derive(Debug, Clone)] +pub struct ClusteringResult { + /// List of clusters, each with a representative and duplicates. 
+ pub clusters: Vec, +} + +impl ClusteringResult { + pub fn num_clusters(&self) -> usize { + self.clusters.len() + } + + pub fn num_duplicates(&self) -> usize { + self.clusters.iter().map(|c| c.duplicates.len()).sum() + } + + pub fn num_unique(&self) -> usize { + self.clusters.len() + } + + /// Get the schema for clustering result batches. + pub fn schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("representative", DataType::UInt64, false), + Field::new( + "duplicates", + DataType::List(Arc::new(Field::new("item", DataType::UInt64, true))), + false, + ), + ])) + } + + /// Convert to Arrow RecordBatch with columns: + /// - `representative`: `UInt64` + /// - `duplicates`: `List` + pub fn to_record_batch(&self) -> RecordBatch { + let schema = Self::schema(); + + let mut representatives = Vec::with_capacity(self.clusters.len()); + let mut duplicates_builder = ListBuilder::new(UInt64Builder::new()); + + for cluster in &self.clusters { + representatives.push(cluster.representative); + for &dup in &cluster.duplicates { + duplicates_builder.values().append_value(dup); + } + duplicates_builder.append(true); + } + + let representative_array: ArrayRef = Arc::new(UInt64Array::from(representatives)); + let duplicates_array: ArrayRef = Arc::new(duplicates_builder.finish()); + + RecordBatch::try_new(schema, vec![representative_array, duplicates_array]) + .expect("Failed to create RecordBatch") + } + + /// Convert to a RecordBatchReader that yields batches of the specified size. 
+ /// + /// # Arguments + /// * `batch_size` - Number of clusters per batch (default: 10000) + pub fn into_reader(self, batch_size: Option) -> Box { + let batch_size = batch_size.unwrap_or(10_000); + let schema = Self::schema(); + + if self.clusters.is_empty() { + // Return empty reader + let batches: Vec> = vec![]; + return Box::new(RecordBatchIterator::new(batches, schema)); + } + + let batches: Vec> = self + .clusters + .chunks(batch_size) + .map(|chunk| { + let mut representatives = Vec::with_capacity(chunk.len()); + let mut duplicates_builder = ListBuilder::new(UInt64Builder::new()); + + for cluster in chunk { + representatives.push(cluster.representative); + for &dup in &cluster.duplicates { + duplicates_builder.values().append_value(dup); + } + duplicates_builder.append(true); + } + + let representative_array: ArrayRef = Arc::new(UInt64Array::from(representatives)); + let duplicates_array: ArrayRef = Arc::new(duplicates_builder.finish()); + + RecordBatch::try_new(Self::schema(), vec![representative_array, duplicates_array]) + }) + .collect(); + + Box::new(RecordBatchIterator::new(batches, schema)) + } +} + +/// Cluster edges using union-find algorithm. +/// +/// Takes a list of edges (row_id_a, row_id_b) and groups connected nodes +/// into clusters. Each cluster has a representative (smallest row ID) +/// and a list of duplicates. 
+pub fn cluster_edges(edges: I) -> ClusteringResult +where + I: IntoIterator, +{ + let mut uf = UnionFind::new(); + + for (a, b) in edges { + uf.union(a, b); + } + + let mut clusters_map: HashMap> = HashMap::new(); + let nodes: Vec = uf.nodes().copied().collect(); + + for node in nodes { + let root = uf.find(node); + clusters_map.entry(root).or_default().push(node); + } + + let mut clusters = Vec::new(); + for (_root, mut members) in clusters_map { + members.sort_unstable(); + + if members.len() > 1 { + let representative = *members.iter().min().unwrap(); + let duplicates: Vec = members + .into_iter() + .filter(|&m| m != representative) + .collect(); + + clusters.push(Cluster { + representative, + duplicates, + }); + } + } + + clusters.sort_by_key(|c| c.representative); + + ClusteringResult { clusters } +} + +/// Cluster edges from PairwiseResult. +pub fn cluster_pairwise_result(result: &PairwiseResult) -> ClusteringResult { + let edges = result + .row_id_a + .iter() + .zip(result.row_id_b.iter()) + .map(|(&a, &b)| (a, b)); + + cluster_edges(edges) +} + #[cfg(test)] mod tests { use super::*; @@ -102,4 +746,677 @@ mod tests { let y = vec![0b1101_1010, 0b1010_1010, 0b1010_1001]; assert_eq!(hamming(&x, &y), 2.0); } + + #[test] + fn test_hamming_u64() { + assert_eq!(hamming_u64(0, 0), 0); + assert_eq!(hamming_u64(0, 1), 1); + assert_eq!(hamming_u64(0b1111, 0b0000), 4); + assert_eq!(hamming_u64(u64::MAX, 0), 64); + assert_eq!(hamming_u64(0xAAAAAAAAAAAAAAAA, 0x5555555555555555), 64); + } + + #[test] + fn test_hamming_batch_u64() { + let query = 0u64; + let targets: Vec = (0..128).collect(); + let mut results = vec![0u32; 128]; + + hamming_batch_u64(query, &targets, &mut results); + + assert_eq!(results[0], 0); + assert_eq!(results[1], 1); + assert_eq!(results[3], 2); // 0b11 has 2 bits set + assert_eq!(results[7], 3); // 0b111 has 3 bits set + } + + #[test] + fn test_pairwise_basic() { + let hashes = vec![0b0000u64, 0b0001, 0b0011, 0b0111]; + let result = 
pairwise_hamming_distance(&hashes, None, None); + + assert_eq!(result.len(), 6); // C(4,2) = 6 pairs + assert!(result.distances.iter().all(|&d| d <= 3)); + } + + #[test] + fn test_pairwise_with_threshold() { + let hashes = vec![0b0000u64, 0b0001, 0b1111]; + let result = pairwise_hamming_distance(&hashes, None, Some(1)); + + assert_eq!(result.len(), 1); + assert_eq!(result.row_id_a[0], 0); + assert_eq!(result.row_id_b[0], 1); + assert_eq!(result.distances[0], 1); + } + + #[test] + fn test_pairwise_with_row_ids() { + let hashes = vec![0b0000u64, 0b0001]; + let row_ids = vec![100u64, 200u64]; + let result = pairwise_hamming_distance(&hashes, Some(&row_ids), None); + + assert_eq!(result.len(), 1); + assert_eq!(result.row_id_a[0], 100); + assert_eq!(result.row_id_b[0], 200); + } + + #[test] + fn test_pairwise_parallel() { + let hashes: Vec = (0..100).collect(); + let result_seq = pairwise_hamming_distance(&hashes, None, None); + let result_par = pairwise_hamming_distance_parallel(&hashes, None, None); + + assert_eq!(result_seq.len(), result_par.len()); + } + + #[test] + fn test_union_find_basic() { + let mut uf = UnionFind::new(); + + assert_eq!(uf.find(1), 1); + assert_eq!(uf.find(2), 2); + assert_eq!(uf.find(3), 3); + + assert!(uf.union(1, 2)); + assert_eq!(uf.find(1), uf.find(2)); + + assert!(uf.union(2, 3)); + assert_eq!(uf.find(1), uf.find(3)); + + assert!(!uf.union(1, 3)); + } + + #[test] + fn test_cluster_edges_simple() { + let edges = vec![(1, 2), (2, 3), (4, 5)]; + let result = cluster_edges(edges); + + assert_eq!(result.num_clusters(), 2); + + let c1 = result + .clusters + .iter() + .find(|c| c.representative == 1) + .unwrap(); + assert_eq!(c1.duplicates.len(), 2); + assert!(c1.duplicates.contains(&2)); + assert!(c1.duplicates.contains(&3)); + + let c2 = result + .clusters + .iter() + .find(|c| c.representative == 4) + .unwrap(); + assert_eq!(c2.duplicates.len(), 1); + assert!(c2.duplicates.contains(&5)); + } + + #[test] + fn test_cluster_pairwise_result() { + 
let hashes = vec![0b0000u64, 0b0001, 0b0011]; // distances: (0,1)=1, (0,2)=2, (1,2)=1 + let pairwise = pairwise_hamming_distance(&hashes, None, Some(1)); // threshold 1 + + // Only pairs with distance <= 1: (0,1) and (1,2) + assert_eq!(pairwise.len(), 2); + + let clustering = cluster_pairwise_result(&pairwise); + // All three should be in one cluster since 0-1-2 are connected + assert_eq!(clustering.num_clusters(), 1); + assert_eq!(clustering.clusters[0].representative, 0); + assert_eq!(clustering.clusters[0].duplicates.len(), 2); + } + + #[test] + fn test_into_record_batch() { + let hashes = vec![0b0000u64, 0b0001, 0b0011]; + let result = pairwise_hamming_distance(&hashes, None, None); + let batch = result.into_record_batch(); + + assert_eq!(batch.num_rows(), 3); + assert_eq!(batch.num_columns(), 3); + assert_eq!(batch.schema().field(0).name(), "row_id_a"); + assert_eq!(batch.schema().field(1).name(), "row_id_b"); + assert_eq!(batch.schema().field(2).name(), "distance"); + } + + // ========================================================================= + // Additional tests from pairwise-hamming reference implementation + // ========================================================================= + + /// Reference implementation for validation - simple O(n²) nested loop + fn reference_pairwise(hashes: &[u64], threshold: Option) -> Vec<(usize, usize, u32)> { + let threshold = threshold.unwrap_or(u32::MAX); + let mut results = Vec::new(); + for i in 0..hashes.len() { + for j in (i + 1)..hashes.len() { + let dist = (hashes[i] ^ hashes[j]).count_ones(); + if dist <= threshold { + results.push((i, j, dist)); + } + } + } + results + } + + /// Convert PairwiseResult to sorted vec for comparison + fn result_to_sorted_vec(result: &PairwiseResult) -> Vec<(u64, u64, u32)> { + let mut v: Vec<_> = result + .row_id_a + .iter() + .zip(result.row_id_b.iter()) + .zip(result.distances.iter()) + .map(|((&a, &b), &d)| (a, b, d)) + .collect(); + v.sort(); + v + } + + #[test] + fn 
test_pairwise_correctness_small() { + // Deterministic hashes with known distances + let hashes = vec![ + 0b0000_0000u64, // 0 + 0b0000_0001u64, // 1 bit from 0 + 0b0000_0011u64, // 2 bits from 0, 1 bit from 1 + 0b0000_0111u64, // 3 bits from 0, 2 bits from 1, 1 bit from 2 + 0b0000_1111u64, // 4 bits from 0, 3 bits from 1, 2 bits from 2, 1 bit from 3 + ]; + + let result = pairwise_hamming_distance(&hashes, None, None); + let reference = reference_pairwise(&hashes, None); + + assert_eq!(result.len(), reference.len()); + assert_eq!(result.len(), 10); // C(5,2) = 10 pairs + + // Verify specific distances + let result_vec = result_to_sorted_vec(&result); + for (i, j, expected_dist) in &reference { + let found = result_vec + .iter() + .find(|(a, b, _)| *a == *i as u64 && *b == *j as u64); + assert!(found.is_some(), "Missing pair ({}, {})", i, j); + assert_eq!( + found.unwrap().2, + *expected_dist, + "Wrong distance for pair ({}, {})", + i, + j + ); + } + } + + #[test] + fn test_pairwise_correctness_1000_deterministic() { + // Generate deterministic hashes using simple linear pattern + let hashes: Vec = (0u64..1000) + .map(|i| i.wrapping_mul(0x123456789ABCDEF)) + .collect(); + + let result_seq = pairwise_hamming_distance(&hashes, None, Some(10)); + let result_par = pairwise_hamming_distance_parallel(&hashes, None, Some(10)); + let reference = reference_pairwise(&hashes, Some(10)); + + // Both implementations should match reference + assert_eq!( + result_seq.len(), + reference.len(), + "Sequential result count mismatch" + ); + assert_eq!( + result_par.len(), + reference.len(), + "Parallel result count mismatch" + ); + + // Verify all pairs match + let seq_sorted = result_to_sorted_vec(&result_seq); + let par_sorted = result_to_sorted_vec(&result_par); + + for (i, j, dist) in &reference { + let seq_found = seq_sorted + .iter() + .find(|(a, b, _)| *a == *i as u64 && *b == *j as u64); + let par_found = par_sorted + .iter() + .find(|(a, b, _)| *a == *i as u64 && *b == *j as 
u64); + + assert!( + seq_found.is_some(), + "Sequential missing pair ({}, {})", + i, + j + ); + assert!(par_found.is_some(), "Parallel missing pair ({}, {})", i, j); + assert_eq!(seq_found.unwrap().2, *dist); + assert_eq!(par_found.unwrap().2, *dist); + } + } + + #[test] + fn test_pairwise_correctness_10000_deterministic() { + // Larger test with 10K hashes + let hashes: Vec = (0u64..10_000) + .map(|i| { + // Mix bits using a simple hash-like transformation + let x = i.wrapping_mul(0xDEADBEEFCAFEBABE); + x ^ (x >> 17) ^ (x << 13) + }) + .collect(); + + let result_seq = pairwise_hamming_distance(&hashes, None, Some(5)); + let result_par = pairwise_hamming_distance_parallel(&hashes, None, Some(5)); + + // Both should find the same number of pairs + assert_eq!( + result_seq.len(), + result_par.len(), + "10K test: sequential found {} pairs, parallel found {} pairs", + result_seq.len(), + result_par.len() + ); + + // Verify they contain the same pairs (sorted comparison) + let seq_sorted = result_to_sorted_vec(&result_seq); + let par_sorted = result_to_sorted_vec(&result_par); + assert_eq!(seq_sorted, par_sorted, "10K test: pair contents differ"); + } + + #[test] + fn test_pairwise_total_pairs_count() { + // Without threshold, should return exactly n*(n-1)/2 pairs + for n in [10, 50, 100, 500] { + let hashes: Vec = (0..n).map(|i| i as u64).collect(); + let result = pairwise_hamming_distance_parallel(&hashes, None, None); + let expected = n * (n - 1) / 2; + assert_eq!( + result.len(), + expected, + "n={}: expected {} pairs, got {}", + n, + expected, + result.len() + ); + } + } + + #[test] + fn test_pairwise_threshold_filtering() { + // All identical hashes should have distance 0 + let hashes = vec![0xABCDEF0123456789u64; 100]; + let result = pairwise_hamming_distance_parallel(&hashes, None, Some(0)); + + // All pairs should be included (distance 0) + assert_eq!(result.len(), 100 * 99 / 2); + assert!(result.distances.iter().all(|&d| d == 0)); + + // With threshold 0 and 
all different hashes, should find fewer pairs + let different_hashes: Vec = (0u64..100).collect(); + let result2 = pairwise_hamming_distance_parallel(&different_hashes, None, Some(0)); + // Only pairs with identical values should match (none in this case except 0^0) + assert!(result2.len() < 100 * 99 / 2); + } + + #[test] + fn test_pairwise_row_ids_preserved() { + let hashes: Vec = (0u64..100).collect(); + let row_ids: Vec = (1000u64..1100).collect(); // offset row IDs + + let result = pairwise_hamming_distance_parallel(&hashes, Some(&row_ids), Some(5)); + + // All row IDs should be in range [1000, 1100) + for &id in &result.row_id_a { + assert!((1000..1100).contains(&id), "row_id_a {} out of range", id); + } + for &id in &result.row_id_b { + assert!((1000..1100).contains(&id), "row_id_b {} out of range", id); + } + // row_id_a should always be less than row_id_b (upper triangular) + for (&a, &b) in result.row_id_a.iter().zip(result.row_id_b.iter()) { + assert!(a < b, "Expected row_id_a < row_id_b, got {} >= {}", a, b); + } + } + + #[test] + fn test_pairwise_distance_bounds() { + // All distances should be in [0, 64] for u64 hashes + let hashes: Vec = (0u64..1000).map(|i| i.wrapping_mul(0x123456789)).collect(); + + let result = pairwise_hamming_distance_parallel(&hashes, None, None); + + for &d in &result.distances { + assert!(d <= 64, "Distance {} exceeds maximum 64", d); + } + } + + #[test] + fn test_pairwise_symmetry() { + // Hamming distance is symmetric: d(a,b) = d(b,a) + let hashes: Vec = vec![ + 0x0000000000000000, + 0xFFFFFFFFFFFFFFFF, + 0xAAAAAAAAAAAAAAAA, + 0x5555555555555555, + 0x123456789ABCDEF0, + ]; + + let result = pairwise_hamming_distance(&hashes, None, None); + + // For each pair (i,j), verify distance matches manual calculation + for idx in 0..result.len() { + let i = result.row_id_a[idx] as usize; + let j = result.row_id_b[idx] as usize; + let dist = result.distances[idx]; + + let expected = (hashes[i] ^ hashes[j]).count_ones(); + 
assert_eq!(dist, expected, "Distance mismatch for pair ({}, {})", i, j); + } + } + + #[test] + fn test_balanced_chunks() { + // Verify chunks are reasonably balanced + let n = 10000; + let total_pairs = n * (n - 1) / 2; + let target_per_chunk = total_pairs / 16; + + let chunks = compute_balanced_chunks(n, target_per_chunk); + + // Should have roughly 16 chunks + assert!( + chunks.len() >= 14 && chunks.len() <= 18, + "Expected ~16 chunks, got {}", + chunks.len() + ); + + // Each chunk should have roughly equal work + for (start, end) in &chunks { + let mut chunk_pairs = 0usize; + for i in *start..*end { + chunk_pairs += n - i - 1; + } + // Allow 20% deviation from target + let lower = target_per_chunk * 80 / 100; + // last chunk may be smaller + assert!( + chunk_pairs >= lower || *end == n, + "Chunk [{}, {}) has {} pairs, expected ~{}", + start, + end, + chunk_pairs, + target_per_chunk + ); + } + + // Chunks should cover all rows without gaps + assert_eq!(chunks[0].0, 0); + assert_eq!(chunks.last().unwrap().1, n); + for i in 1..chunks.len() { + assert_eq!(chunks[i].0, chunks[i - 1].1, "Gap between chunks"); + } + } + + // ========================================================================= + // SIMD-specific tests + // ========================================================================= + + #[test] + #[cfg(target_arch = "x86_64")] + fn test_avx2_popcount() { + if !is_x86_feature_detected!("avx2") { + return; + } + + let query = 0u64; + let targets = vec![0u64, 1, 3, 7, 15, 31, 63, 127]; + let mut results = vec![0u32; 8]; + + unsafe { + hamming_batch_avx2(query, &targets, &mut results); + } + + assert_eq!(results[0], 0); // 0 ^ 0 = 0 bits + assert_eq!(results[1], 1); // 0 ^ 1 = 1 bit + assert_eq!(results[2], 2); // 0 ^ 3 = 2 bits + assert_eq!(results[3], 3); // 0 ^ 7 = 3 bits + assert_eq!(results[4], 4); // 0 ^ 15 = 4 bits + assert_eq!(results[5], 5); // 0 ^ 31 = 5 bits + assert_eq!(results[6], 6); // 0 ^ 63 = 6 bits + assert_eq!(results[7], 7); // 0 ^ 127 
= 7 bits + } + + #[test] + #[cfg(target_arch = "x86_64")] + fn test_avx2_max_distance() { + if !is_x86_feature_detected!("avx2") { + return; + } + + let query = 0u64; + let targets = vec![u64::MAX; 4]; + let mut results = vec![0u32; 4]; + + unsafe { + hamming_batch_avx2(query, &targets, &mut results); + } + + for &r in &results { + assert_eq!(r, 64); + } + } + + #[test] + #[cfg(target_arch = "x86_64")] + fn test_avx512_popcount() { + if !is_x86_feature_detected!("avx512vpopcntdq") || !is_x86_feature_detected!("avx512f") { + return; + } + + let query = 0u64; + let targets = vec![0u64, 1, 3, 7, 15, 31, 63, 127]; + let mut results = vec![0u32; 8]; + + unsafe { + hamming_batch_avx512(query, &targets, &mut results); + } + + assert_eq!(results[0], 0); + assert_eq!(results[1], 1); + assert_eq!(results[2], 2); + assert_eq!(results[3], 3); + assert_eq!(results[4], 4); + assert_eq!(results[5], 5); + assert_eq!(results[6], 6); + assert_eq!(results[7], 7); + } + + // ========================================================================= + // Additional clustering tests + // ========================================================================= + + #[test] + fn test_union_find_path_compression() { + let mut uf = UnionFind::new(); + + // Create a chain: 1 -> 2 -> 3 -> 4 -> 5 + uf.union(4, 5); + uf.union(3, 4); + uf.union(2, 3); + uf.union(1, 2); + + // All should have the same root + let root = uf.find(1); + assert_eq!(uf.find(2), root); + assert_eq!(uf.find(3), root); + assert_eq!(uf.find(4), root); + assert_eq!(uf.find(5), root); + } + + #[test] + fn test_cluster_edges_single_cluster() { + // All connected: 1-2-3-4-5 + let edges = vec![(1, 2), (2, 3), (3, 4), (4, 5)]; + let result = cluster_edges(edges); + + assert_eq!(result.num_clusters(), 1); + let cluster = &result.clusters[0]; + assert_eq!(cluster.representative, 1); + assert_eq!(cluster.duplicates.len(), 4); + assert_eq!(cluster.size(), 5); + } + + #[test] + fn test_cluster_edges_no_duplicates() { + // No edges 
means no clusters + let edges: Vec<(u64, u64)> = vec![]; + let result = cluster_edges(edges); + + assert_eq!(result.num_clusters(), 0); + assert_eq!(result.num_duplicates(), 0); + } + + #[test] + fn test_cluster_edges_self_loop() { + // Self-loop shouldn't create a cluster (size 1) + let edges = vec![(1, 1), (2, 3)]; + let result = cluster_edges(edges); + + // Only {2,3} should be a cluster + assert_eq!(result.num_clusters(), 1); + assert_eq!(result.clusters[0].representative, 2); + } + + #[test] + fn test_cluster_edges_duplicate_edges() { + // Duplicate edges should be handled correctly + let edges = vec![(1, 2), (1, 2), (2, 3), (2, 3), (3, 1)]; + let result = cluster_edges(edges); + + assert_eq!(result.num_clusters(), 1); + assert_eq!(result.clusters[0].size(), 3); + } + + #[test] + fn test_cluster_edges_large() { + // Create 100 clusters of size 10 each + let mut edges = Vec::new(); + for cluster_id in 0..100u64 { + let base = cluster_id * 10; + for i in 0..9 { + edges.push((base + i, base + i + 1)); + } + } + + let result = cluster_edges(edges); + + assert_eq!(result.num_clusters(), 100); + for cluster in &result.clusters { + assert_eq!(cluster.size(), 10); + assert_eq!(cluster.duplicates.len(), 9); + } + } + + #[test] + fn test_cluster_edges_random_order() { + // Same edges in different order should produce same result + let edges1 = vec![(1, 2), (2, 3), (4, 5), (3, 4)]; + let edges2 = vec![(4, 5), (1, 2), (3, 4), (2, 3)]; + let edges3 = vec![(3, 4), (4, 5), (2, 3), (1, 2)]; + + let r1 = cluster_edges(edges1); + let r2 = cluster_edges(edges2); + let r3 = cluster_edges(edges3); + + // All should produce the same single cluster + assert_eq!(r1.num_clusters(), 1); + assert_eq!(r2.num_clusters(), 1); + assert_eq!(r3.num_clusters(), 1); + + assert_eq!(r1.clusters[0].representative, 1); + assert_eq!(r2.clusters[0].representative, 1); + assert_eq!(r3.clusters[0].representative, 1); + + assert_eq!(r1.clusters[0].size(), 5); + assert_eq!(r2.clusters[0].size(), 5); + 
assert_eq!(r3.clusters[0].size(), 5); + } + + #[test] + fn test_cluster_edges_non_contiguous_ids() { + // Row IDs don't need to be contiguous + let edges = vec![(100, 200), (200, 500), (1000, 2000)]; + let result = cluster_edges(edges); + + assert_eq!(result.num_clusters(), 2); + + let c1 = result + .clusters + .iter() + .find(|c| c.representative == 100) + .unwrap(); + assert_eq!(c1.duplicates, vec![200, 500]); + + let c2 = result + .clusters + .iter() + .find(|c| c.representative == 1000) + .unwrap(); + assert_eq!(c2.duplicates, vec![2000]); + } + + #[test] + fn test_cluster_representative_is_minimum() { + // Representative should always be the minimum row ID in cluster + let edges = vec![ + (5, 3), + (3, 7), + (7, 1), // 1 is minimum + (100, 50), + (50, 75), // 50 is minimum + ]; + let result = cluster_edges(edges); + + assert_eq!(result.num_clusters(), 2); + + let c1 = result + .clusters + .iter() + .find(|c| c.duplicates.contains(&7)) + .unwrap(); + assert_eq!(c1.representative, 1); + + let c2 = result + .clusters + .iter() + .find(|c| c.duplicates.contains(&100)) + .unwrap(); + assert_eq!(c2.representative, 50); + } + + #[test] + fn test_cluster_duplicates_sorted() { + // Duplicates should be sorted + let edges = vec![(1, 5), (1, 3), (1, 7), (1, 2)]; + let result = cluster_edges(edges); + + assert_eq!(result.num_clusters(), 1); + assert_eq!(result.clusters[0].representative, 1); + assert_eq!(result.clusters[0].duplicates, vec![2, 3, 5, 7]); + } + + #[test] + fn test_clustering_result_stats() { + let edges = vec![ + (1, 2), + (2, 3), // cluster of 3 + (10, 20), + (20, 30), + (30, 40), // cluster of 4 + ]; + let result = cluster_edges(edges); + + assert_eq!(result.num_clusters(), 2); + assert_eq!(result.num_duplicates(), 5); // 2 + 3 + assert_eq!(result.num_unique(), 2); + } } diff --git a/rust/lance/Cargo.toml b/rust/lance/Cargo.toml index 440c3fb301a..6586c928de7 100644 --- a/rust/lance/Cargo.toml +++ b/rust/lance/Cargo.toml @@ -300,5 +300,9 @@ harness = 
false name = "concurrent_append" harness = false +[[bench]] +name = "hamming" +harness = false + [lints] workspace = true diff --git a/rust/lance/benches/hamming.rs b/rust/lance/benches/hamming.rs new file mode 100644 index 00000000000..7e926a795db --- /dev/null +++ b/rust/lance/benches/hamming.rs @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Benchmark for hamming distance clustering. +//! +//! This benchmark tests the pairwise hamming distance computation and clustering +//! performance at various scales. +//! +//! Run with: cargo bench -p lance --bench hamming +//! +//! Environment variables: +//! - DATASET_URI: Path to a dataset with a hash column (optional, generates random if not set) +//! - HASH_COLUMN: Name of the hash column (default: "hash") +//! - SAMPLE_SIZE: Number of rows to sample (default: 10000) +//! - THRESHOLD: Hamming distance threshold (default: 10) + +#![allow(clippy::print_stdout)] + +use std::env; +use std::sync::Arc; +use std::time::Instant; + +use arrow_array::{FixedSizeListArray, RecordBatch, RecordBatchIterator, UInt8Array}; +use arrow_schema::{DataType, Field, FieldRef, Schema}; +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use lance_arrow::FixedSizeListArrayExt; +use rand::Rng; + +use lance::index::vector::hamming::{ + hamming_clustering_for_sample, hamming_clustering_from_hashes, +}; +use lance::{Dataset, dataset::WriteParams}; +use lance_linalg::distance::pairwise_hamming_distance_parallel; + +#[cfg(target_os = "linux")] +use lance_testing::pprof::{Output, PProfProfiler}; + +/// Generate random 64-bit hashes. +fn generate_random_hashes(n: usize) -> Vec { + let mut rng = rand::rng(); + (0..n).map(|_| rng.random()).collect() +} + +/// Generate random hash dataset as Arrow arrays. 
+fn generate_hash_batch(num_rows: usize) -> RecordBatch { + let mut rng = rand::rng(); + + // Generate random bytes for the hashes (8 bytes per hash) + let bytes: Vec = (0..num_rows * 8).map(|_| rng.random()).collect(); + let values = UInt8Array::from(bytes); + + let hash_array = FixedSizeListArray::try_new_from_values(values, 8).unwrap(); + + let schema = Arc::new(Schema::new(vec![Field::new( + "hash", + DataType::FixedSizeList(FieldRef::new(Field::new("item", DataType::UInt8, true)), 8), + false, + )])); + + RecordBatch::try_new(schema, vec![Arc::new(hash_array)]).unwrap() +} + +/// Create a test dataset with random hashes. +async fn create_hash_dataset(path: &std::path::Path, num_rows: usize) { + let batch = generate_hash_batch(num_rows); + let schema = batch.schema(); + + let write_params = WriteParams { + max_rows_per_file: num_rows, + max_rows_per_group: 10_000, + ..Default::default() + }; + + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + Dataset::write(reader, path.to_str().unwrap(), Some(write_params)) + .await + .unwrap(); +} + +/// Benchmark pure pairwise hamming computation (no I/O). +fn bench_pairwise_compute(c: &mut Criterion) { + let mut group = c.benchmark_group("hamming_pairwise_compute"); + + for size in [1_000, 5_000, 10_000, 20_000] { + let hashes = generate_random_hashes(size); + let total_pairs = (size as u64) * (size as u64 - 1) / 2; + + group.throughput(Throughput::Elements(total_pairs)); + group.bench_with_input(BenchmarkId::new("parallel", size), &hashes, |b, hashes| { + b.iter(|| { + pairwise_hamming_distance_parallel(hashes, None, Some(10)); + }); + }); + } + + group.finish(); +} + +/// Benchmark full clustering pipeline (compute + cluster). 
+fn bench_cluster_hashes(c: &mut Criterion) { + let mut group = c.benchmark_group("hamming_cluster"); + + for size in [1_000, 5_000, 10_000] { + let hashes = generate_random_hashes(size); + + group.bench_with_input( + BenchmarkId::new("full_pipeline", size), + &hashes, + |b, hashes| { + b.iter(|| { + hamming_clustering_from_hashes(hashes, None, 10); + }); + }, + ); + } + + group.finish(); +} + +/// Benchmark with dataset I/O (if DATASET_URI is set). +fn bench_dataset_cluster(c: &mut Criterion) { + let rt = tokio::runtime::Runtime::new().unwrap(); + + // Check if we should use an external dataset + let dataset_uri = env::var("DATASET_URI").ok(); + let hash_column = env::var("HASH_COLUMN").unwrap_or_else(|_| "hash".to_string()); + let sample_size: usize = env::var("SAMPLE_SIZE") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(10_000); + let threshold: u32 = env::var("THRESHOLD") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(10); + + let mut group = c.benchmark_group("hamming_dataset"); + + if let Some(uri) = dataset_uri { + // Use external dataset + println!("Using external dataset: {}", uri); + println!( + "Column: {}, Sample: {}, Threshold: {}", + hash_column, sample_size, threshold + ); + + let dataset = rt.block_on(async { Dataset::open(&uri).await.unwrap() }); + + group.bench_function(format!("external_sample_{}", sample_size), |b| { + b.to_async(&rt).iter(|| async { + hamming_clustering_for_sample(&dataset, &hash_column, Some(sample_size), threshold) + .await + .unwrap() + }); + }); + } else { + // Create temporary dataset with random hashes + let temp_dir = tempfile::tempdir().unwrap(); + let uri = temp_dir.path().join("bench_hashes.lance"); + + rt.block_on(async { + create_hash_dataset(&uri, 100_000).await; + }); + + let dataset = rt.block_on(async { Dataset::open(uri.to_str().unwrap()).await.unwrap() }); + + for sample in [1_000, 5_000, 10_000] { + group.bench_function(format!("generated_sample_{}", sample), |b| { + let ds = dataset.clone(); + 
b.to_async(&rt).iter(|| { + let ds = ds.clone(); + async move { + hamming_clustering_for_sample(&ds, "hash", Some(sample), 10) + .await + .unwrap() + } + }); + }); + } + } + + group.finish(); +} + +/// Quick standalone benchmark that prints results (for quick testing). +#[allow(dead_code)] +fn run_quick_bench() { + println!("=== Hamming Distance Clustering Benchmark ===\n"); + + let sizes = [1_000, 5_000, 10_000, 20_000]; + + for &size in &sizes { + let hashes = generate_random_hashes(size); + let total_pairs = (size as u64) * (size as u64 - 1) / 2; + + println!("Size: {} rows, {} pairs", size, total_pairs); + let start = Instant::now(); + let reader = hamming_clustering_from_hashes(&hashes, None, 10); + // Consume the reader to count clusters + let cluster_count: usize = reader.map(|b| b.unwrap().num_rows()).sum(); + let elapsed = start.elapsed(); + + let pairs_per_sec = total_pairs as f64 / elapsed.as_secs_f64(); + println!( + " Total time: {:?} ({:.2}M pairs/sec)", + elapsed, + pairs_per_sec / 1_000_000.0 + ); + println!(" Total clusters: {}", cluster_count); + println!(); + } +} + +#[cfg(target_os = "linux")] +criterion_group! 
{ + name = benches; + config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = bench_pairwise_compute, bench_cluster_hashes, bench_dataset_cluster +} + +#[cfg(not(target_os = "linux"))] +criterion_group!( + benches, + bench_pairwise_compute, + bench_cluster_hashes, + bench_dataset_cluster +); + +criterion_main!(benches); diff --git a/rust/lance/src/index/vector.rs b/rust/lance/src/index/vector.rs index 0eb66ea2ede..af48bc94c41 100644 --- a/rust/lance/src/index/vector.rs +++ b/rust/lance/src/index/vector.rs @@ -9,6 +9,7 @@ use std::{any::Any, collections::HashMap}; pub mod builder; pub(crate) mod details; +pub mod hamming; pub mod ivf; pub mod pq; pub mod utils; diff --git a/rust/lance/src/index/vector/hamming.rs b/rust/lance/src/index/vector/hamming.rs new file mode 100644 index 00000000000..ba6ea98c42d --- /dev/null +++ b/rust/lance/src/index/vector/hamming.rs @@ -0,0 +1,938 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Hamming distance clustering for IVF_FLAT indices. +//! +//! This module provides functionality to perform pairwise hamming distance +//! computation and clustering on specific partitions of IVF_FLAT indices. 
+ +use std::time::Instant; + +use arrow_array::RecordBatchReader; +use arrow_array::cast::AsArray; +use arrow_array::types::UInt64Type; +use arrow_schema::DataType; +use lance_core::{Error, Result}; +use lance_index::metrics::NoOpMetricsCollector; +use lance_index::vector::VectorIndex; +use lance_index::vector::flat::index::{FlatBinQuantizer, FlatIndex}; +use lance_index::vector::flat::storage::FLAT_COLUMN; +use lance_index::vector::storage::VectorStore; +use lance_linalg::distance::{ + ClusteringResult, cluster_pairwise_result, extract_hashes_from_fixed_list, + pairwise_hamming_distance_parallel, +}; +use rand::rng; +use rand::seq::index::sample; + +use crate::dataset::Dataset; +use crate::index::{DatasetIndexExt, DatasetIndexInternalExt}; + +use super::ivf::v2::IVFIndex; + +/// Perform pairwise hamming distance clustering on a partition of an IVF_FLAT index. +/// +/// This function loads a specific partition from an IVF_FLAT index on a hash column, +/// computes pairwise hamming distances between all hashes in the partition, +/// filters by threshold, and clusters the results using union-find. 
+/// +/// # Arguments +/// +/// * `dataset` - The Lance dataset +/// * `index_name` - Name of the IVF_FLAT index on the hash column +/// * `partition_id` - The partition ID within the IVF_FLAT index +/// * `hamming_threshold` - Maximum hamming distance to consider as similar +/// +/// # Returns +/// +/// A `RecordBatchReader` yielding batches with columns: +/// - `representative`: UInt64 - The representative row ID for each cluster +/// - `duplicates`: `List` - List of duplicate row IDs in each cluster +/// +/// # Errors +/// +/// Returns an error if: +/// - The index doesn't exist or is not an IVF_FLAT index +/// - The indexed column has wrong type (must be `FixedSizeList`) +/// - The partition ID is out of range +pub async fn hamming_clustering_for_ivf_partition( + dataset: &Dataset, + index_name: &str, + partition_id: usize, + hamming_threshold: u32, +) -> Result> { + // Load indices and find the IVF_FLAT index + let indices = dataset.load_indices().await?; + let index_meta = indices + .iter() + .find(|idx| idx.name == index_name) + .ok_or_else(|| { + Error::invalid_input(format!("Index '{}' not found on dataset", index_name)) + })?; + + // Get the column name from the index metadata + let schema = dataset.schema(); + let field_id = index_meta + .fields + .first() + .ok_or_else(|| Error::invalid_input(format!("Index '{}' has no fields", index_name)))?; + let field = schema.field_by_id(*field_id).ok_or_else(|| { + Error::invalid_input(format!( + "Field with id {} not found in schema for index '{}'", + field_id, index_name + )) + })?; + let column = &field.name; + + // Check column is FixedSizeList + let data_type = field.data_type(); + match data_type { + DataType::FixedSizeList(inner, 8) => { + if *inner.data_type() != DataType::UInt8 { + return Err(Error::invalid_input(format!( + "Column '{}' must be FixedSizeList, got FixedSizeList<{:?}, 8>", + column, + inner.data_type() + ))); + } + } + _ => { + return Err(Error::invalid_input(format!( + "Column '{}' must be 
FixedSizeList<UInt8, 8>, got {:?}", + column, data_type + ))); + } + } + + // Open the vector index + let index = dataset + .open_vector_index(column, &index_meta.uuid, &NoOpMetricsCollector) + .await?; + + // Try to downcast to IVFIndex (IVF_FLAT for binary data) + let ivf_index = index + .as_any() + .downcast_ref::<IVFIndex<FlatIndex, FlatBinQuantizer>>() + .ok_or_else(|| { + Error::invalid_input(format!( + "Index '{}' is not an IVF_FLAT index for binary data", + index_name + )) + })?; + + // Check partition ID is valid + let num_partitions = ivf_index.ivf_model().num_partitions(); + if partition_id >= num_partitions { + return Err(Error::invalid_input(format!( + "Partition ID {} is out of range (0..{})", + partition_id, num_partitions + ))); + } + + // Load the partition storage + let storage = ivf_index.load_partition_storage(partition_id, None).await?; + + // Get row IDs + let row_id_slice: Vec<u64> = storage.row_ids().copied().collect(); + + if row_id_slice.is_empty() { + let empty = ClusteringResult { + clusters: Vec::new(), + }; + return Ok(empty.into_reader(None)); + } + + // Get vectors from the storage batches + let batches: Vec<_> = storage.to_batches()?.collect(); + if batches.is_empty() { + let empty = ClusteringResult { + clusters: Vec::new(), + }; + return Ok(empty.into_reader(None)); + } + + // Extract the hash vectors from the FLAT_COLUMN + let mut all_hashes = Vec::new(); + for batch in &batches { + let vectors = batch + .column_by_name(FLAT_COLUMN) + .ok_or_else(|| { + Error::invalid_input(format!("Column '{}' not found in storage", FLAT_COLUMN)) + })? 
+ .as_fixed_size_list(); + let hashes = extract_hashes_from_fixed_list(vectors)?; + all_hashes.extend(hashes); + } + + // Compute pairwise hamming distances with threshold filtering + let pairwise_result = pairwise_hamming_distance_parallel( + &all_hashes, + Some(&row_id_slice), + Some(hamming_threshold), + ); + + // Cluster the results + let clustering = cluster_pairwise_result(&pairwise_result); + + Ok(clustering.into_reader(None)) +} + +/// Get partition statistics for an IVF_FLAT index. +pub async fn get_ivf_partition_info( + dataset: &Dataset, + index_name: &str, +) -> Result<Vec<PartitionInfo>> { + let indices = dataset.load_indices().await?; + let index_meta = indices + .iter() + .find(|idx| idx.name == index_name) + .ok_or_else(|| { + Error::invalid_input(format!("Index '{}' not found on dataset", index_name)) + })?; + + // Get the column name from the index metadata + let schema = dataset.schema(); + let field_id = index_meta + .fields + .first() + .ok_or_else(|| Error::invalid_input(format!("Index '{}' has no fields", index_name)))?; + let field = schema.field_by_id(*field_id).ok_or_else(|| { + Error::invalid_input(format!( + "Field with id {} not found in schema for index '{}'", + field_id, index_name + )) + })?; + let column = &field.name; + + let index = dataset + .open_vector_index(column, &index_meta.uuid, &NoOpMetricsCollector) + .await?; + + let ivf_index = index + .as_any() + .downcast_ref::<IVFIndex<FlatIndex, FlatBinQuantizer>>() + .ok_or_else(|| { + Error::invalid_input(format!( + "Index '{}' is not an IVF_FLAT index for binary data", + index_name + )) + })?; + + let num_partitions = ivf_index.ivf_model().num_partitions(); + let mut partition_infos = Vec::with_capacity(num_partitions); + + for i in 0..num_partitions { + partition_infos.push(PartitionInfo { + partition_id: i, + size: ivf_index.ivf_model().partition_size(i), + }); + } + + Ok(partition_infos) +} + +/// Information about an IVF partition. 
+#[derive(Debug, Clone)] +pub struct PartitionInfo { + pub partition_id: usize, + pub size: usize, +} + +/// Perform pairwise hamming distance clustering on sampled rows from a dataset. +/// +/// This function samples N rows randomly from the dataset, extracts hashes, +/// computes pairwise hamming distances, and clusters the results. +/// It's useful for benchmarking and testing without requiring an IVF index. +/// +/// # Arguments +/// +/// * `dataset` - The Lance dataset +/// * `column` - Name of the hash column (must be `FixedSizeList`) +/// * `sample_size` - Number of rows to sample (if None or >= total rows, uses all rows) +/// * `hamming_threshold` - Maximum hamming distance to consider as similar +/// +/// # Returns +/// +/// A `RecordBatchReader` yielding batches with columns: +/// - `representative`: UInt64 - The representative row ID for each cluster +/// - `duplicates`: `List` - List of duplicate row IDs in each cluster +pub async fn hamming_clustering_for_sample( + dataset: &Dataset, + column: &str, + sample_size: Option, + hamming_threshold: u32, +) -> Result> { + // Validate column exists and has correct type + let schema = dataset.schema(); + let field = schema.field(column).ok_or_else(|| { + Error::invalid_input(format!("Column '{}' not found in dataset schema", column)) + })?; + + // Check column is FixedSizeList + let data_type = field.data_type(); + match data_type { + DataType::FixedSizeList(inner, 8) => { + if *inner.data_type() != DataType::UInt8 { + return Err(Error::invalid_input(format!( + "Column '{}' must be FixedSizeList, got FixedSizeList<{:?}, 8>", + column, + inner.data_type() + ))); + } + } + _ => { + return Err(Error::invalid_input(format!( + "Column '{}' must be FixedSizeList, got {:?}", + column, data_type + ))); + } + } + + // Get total row count + let total_rows: usize = dataset + .get_fragments() + .iter() + .filter_map(|f| f.metadata().physical_rows) + .sum(); + + let use_sampling = sample_size.is_some_and(|s| s < 
total_rows); + let effective_sample = sample_size.unwrap_or(total_rows).min(total_rows); + + // Read data + let (hashes, row_ids) = if use_sampling { + // Random sample using take() with _rowid (take uses positional indices) + let indices: Vec<u64> = sample(&mut rng(), total_rows, effective_sample) + .iter() + .map(|i| i as u64) + .collect(); + + let batch = dataset + .take( + &indices, + crate::dataset::ProjectionRequest::from_columns( + [column, "_rowid"], + dataset.schema(), + ), + ) + .await?; + + let rowid_col = batch.column_by_name("_rowid").ok_or_else(|| { + Error::invalid_input("_rowid column not found in take result".to_string()) + })?; + let row_ids = rowid_col.as_primitive::<UInt64Type>(); + let row_id_vec: Vec<u64> = row_ids.values().to_vec(); + + let hash_col = batch.column_by_name(column).ok_or_else(|| { + Error::invalid_input(format!("Column '{}' not found in result", column)) + })?; + let hashes_arr = hash_col.as_fixed_size_list(); + let hashes = extract_hashes_from_fixed_list(hashes_arr)?; + + (hashes, row_id_vec) + } else { + // Full scan + let batch = dataset + .scan() + .project(&[column])? 
+ .with_row_id() + .try_into_batch() + .await?; + + let rowid_col = batch.column_by_name("_rowid").ok_or_else(|| { + Error::invalid_input("_rowid column not found in scan result".to_string()) + })?; + let row_ids = rowid_col.as_primitive::<UInt64Type>(); + let row_id_vec: Vec<u64> = row_ids.values().to_vec(); + + let hash_col = batch.column_by_name(column).ok_or_else(|| { + Error::invalid_input(format!("Column '{}' not found in result", column)) + })?; + let hashes_arr = hash_col.as_fixed_size_list(); + let hashes = extract_hashes_from_fixed_list(hashes_arr)?; + + (hashes, row_id_vec) + }; + + if hashes.len() < 2 { + let empty = ClusteringResult { + clusters: Vec::new(), + }; + return Ok(empty.into_reader(None)); + } + + // Compute pairwise hamming distances + let pairwise = + pairwise_hamming_distance_parallel(&hashes, Some(&row_ids), Some(hamming_threshold)); + + // Cluster edges + let clustering = cluster_pairwise_result(&pairwise); + + Ok(clustering.into_reader(None)) +} + +/// Perform pairwise hamming distance clustering on a contiguous range of rows from a fragment. +/// +/// This function reads a contiguous range of rows from a specific fragment, +/// extracts hashes, computes pairwise hamming distances, and clusters the results. +/// Unlike sampling, this reads sequential rows which is useful for distributed +/// processing where each worker handles a specific range of a fragment. 
+/// +/// # Arguments +/// +/// * `dataset` - The Lance dataset +/// * `column` - Name of the hash column (must be `FixedSizeList`) +/// * `fragment_id` - The fragment ID to read from +/// * `start_row` - The starting row offset within the fragment +/// * `num_rows` - Number of rows to read from the start position +/// * `hamming_threshold` - Maximum hamming distance to consider as similar +/// +/// # Returns +/// +/// A `RecordBatchReader` yielding batches with columns: +/// - `representative`: UInt64 - The representative row ID for each cluster +/// - `duplicates`: `List` - List of duplicate row IDs in each cluster +/// +/// # Errors +/// +/// Returns an error if: +/// - The fragment doesn't exist +/// - The column has wrong type (must be `FixedSizeList`) +/// - The row range is out of bounds +pub async fn hamming_clustering_for_range( + dataset: &Dataset, + column: &str, + fragment_id: usize, + start_row: usize, + num_rows: usize, + hamming_threshold: u32, +) -> Result> { + // Validate column exists and has correct type + let schema = dataset.schema(); + let field = schema.field(column).ok_or_else(|| { + Error::invalid_input(format!("Column '{}' not found in dataset schema", column)) + })?; + + // Check column is FixedSizeList + let data_type = field.data_type(); + match data_type { + DataType::FixedSizeList(inner, 8) => { + if *inner.data_type() != DataType::UInt8 { + return Err(Error::invalid_input(format!( + "Column '{}' must be FixedSizeList, got FixedSizeList<{:?}, 8>", + column, + inner.data_type() + ))); + } + } + _ => { + return Err(Error::invalid_input(format!( + "Column '{}' must be FixedSizeList, got {:?}", + column, data_type + ))); + } + } + + // Get the fragment + let fragment = dataset.get_fragment(fragment_id).ok_or_else(|| { + Error::invalid_input(format!("Fragment with ID {} not found", fragment_id)) + })?; + + // Get fragment metadata for physical row count + let fragment_meta = fragment.metadata().clone(); + let physical_rows = fragment_meta 
+ .physical_rows + .ok_or_else(|| Error::invalid_input("Fragment has no physical_rows metadata"))?; + + // Validate the range + if start_row >= physical_rows { + return Err(Error::invalid_input(format!( + "start_row {} is out of range for fragment with {} physical rows", + start_row, physical_rows + ))); + } + + // Adjust num_rows if it exceeds available rows + let effective_num_rows = num_rows.min(physical_rows - start_row); + + if effective_num_rows == 0 { + let empty = ClusteringResult { + clusters: Vec::new(), + }; + return Ok(empty.into_reader(None)); + } + + // Use scanner with the specific fragment and limit/offset + let batch = dataset + .scan() + .with_fragments(vec![fragment_meta]) + .project(&[column])? + .with_row_id() + .limit(Some(effective_num_rows as i64), Some(start_row as i64))? + .try_into_batch() + .await?; + + // Extract row IDs + let rowid_col = batch.column_by_name("_rowid").ok_or_else(|| { + Error::invalid_input("_rowid column not found in scan result".to_string()) + })?; + let row_ids = rowid_col.as_primitive::<UInt64Type>(); + let row_id_vec: Vec<u64> = row_ids.values().to_vec(); + + // Extract hashes + let hash_col = batch + .column_by_name(column) + .ok_or_else(|| Error::invalid_input(format!("Column '{}' not found in result", column)))?; + let hashes_arr = hash_col.as_fixed_size_list(); + let hashes = extract_hashes_from_fixed_list(hashes_arr)?; + + if hashes.len() < 2 { + let empty = ClusteringResult { + clusters: Vec::new(), + }; + return Ok(empty.into_reader(None)); + } + + // Compute pairwise hamming distances + let pairwise = + pairwise_hamming_distance_parallel(&hashes, Some(&row_id_vec), Some(hamming_threshold)); + + // Cluster edges + let clustering = cluster_pairwise_result(&pairwise); + + Ok(clustering.into_reader(None)) +} + +/// Perform pairwise hamming distance clustering on provided hashes (no I/O). +/// +/// This is useful for benchmarking the pure compute performance without I/O. +/// Logs timing information via tracing. 
+/// +/// # Arguments +/// +/// * `hashes` - Vector of 64-bit hash values +/// * `row_ids` - Optional row IDs (defaults to indices if None) +/// * `hamming_threshold` - Maximum hamming distance to consider as similar +/// +/// # Returns +/// +/// A `RecordBatchReader` yielding batches with columns: +/// - `representative`: UInt64 - The representative row ID for each cluster +/// - `duplicates`: `List` - List of duplicate row IDs in each cluster +pub fn hamming_clustering_from_hashes( + hashes: &[u64], + row_ids: Option<&[u64]>, + hamming_threshold: u32, +) -> Box { + let num_rows = hashes.len(); + if num_rows < 2 { + let empty = ClusteringResult { + clusters: Vec::new(), + }; + return empty.into_reader(None); + } + + let total_pairs = (num_rows as u64) * (num_rows as u64 - 1) / 2; + + // Compute pairwise hamming distances + let t_compute_start = Instant::now(); + let pairwise = pairwise_hamming_distance_parallel(hashes, row_ids, Some(hamming_threshold)); + let compute_time = t_compute_start.elapsed(); + + // Cluster edges + let t_cluster_start = Instant::now(); + let clustering = cluster_pairwise_result(&pairwise); + let cluster_time = t_cluster_start.elapsed(); + + // Log timing info + let pairs_per_sec = if compute_time.as_secs_f64() > 0.0 { + total_pairs as f64 / compute_time.as_secs_f64() + } else { + 0.0 + }; + tracing::info!( + num_rows, + total_pairs, + edges = pairwise.len(), + compute_time_ms = compute_time.as_millis(), + cluster_time_ms = cluster_time.as_millis(), + pairs_per_sec_millions = pairs_per_sec / 1_000_000.0, + num_clusters = clustering.num_clusters(), + num_duplicates = clustering.num_duplicates(), + "Hamming clustering completed" + ); + + clustering.into_reader(None) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::Array; + + /// Helper to collect all clusters from a reader. 
+ fn collect_clusters(reader: Box) -> Vec<(u64, Vec)> { + let mut clusters = Vec::new(); + for batch in reader { + let batch = batch.unwrap(); + let reps = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let dups = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + + for i in 0..batch.num_rows() { + let rep = reps.value(i); + let dup_arr = dups.value(i); + let dup_values = dup_arr + .as_any() + .downcast_ref::() + .unwrap(); + let duplicates: Vec = dup_values.values().to_vec(); + clusters.push((rep, duplicates)); + } + } + clusters + } + + #[test] + fn test_hamming_clustering_from_hashes_basic() { + // Create some test hashes with known distances + let hashes = vec![ + 0b0000u64, // hash 0 + 0b0001u64, // hash 1 - distance 1 from hash 0 + 0b0011u64, // hash 2 - distance 1 from hash 1, distance 2 from hash 0 + 0b1111u64, // hash 3 - distance 2 from hash 2, distance 4 from hash 0 + ]; + + let reader = hamming_clustering_from_hashes(&hashes, None, 1); + let clusters = collect_clusters(reader); + + // With threshold 1, pairs (0,1) and (1,2) should be connected + // This forms one cluster: {0, 1, 2} + assert_eq!(clusters.len(), 1); + assert_eq!(clusters[0].1.len(), 2); // 2 duplicates in the cluster + } + + #[test] + fn test_hamming_clustering_from_hashes_no_clusters() { + // All hashes are far apart + let hashes = vec![ + 0x0000000000000000u64, + 0xFFFFFFFFFFFFFFFFu64, + 0xAAAAAAAAAAAAAAAAu64, + ]; + + let reader = hamming_clustering_from_hashes(&hashes, None, 5); + let clusters = collect_clusters(reader); + + // With threshold 5, no pairs should be connected (min distance is 32) + assert_eq!(clusters.len(), 0); + } + + #[test] + fn test_hamming_clustering_from_hashes_with_row_ids() { + let hashes = vec![0b0000u64, 0b0001u64]; + let row_ids = vec![100u64, 200u64]; + + let reader = hamming_clustering_from_hashes(&hashes, Some(&row_ids), 1); + let clusters = collect_clusters(reader); + + assert_eq!(clusters.len(), 1); + 
assert_eq!(clusters[0].0, 100); // representative + assert_eq!(clusters[0].1, vec![200]); // duplicates + } + + #[tokio::test] + async fn test_hamming_clustering_for_ivf_partition() { + use arrow_array::{FixedSizeListArray, RecordBatchIterator, UInt8Array}; + use arrow_schema::{Field, Schema}; + use lance_arrow::FixedSizeListArrayExt; + use lance_index::vector::ivf::IvfBuildParams; + use std::sync::Arc; + use tempfile::tempdir; + + // Create test data with hash column (FixedSizeList<UInt8, 8>) + let schema = Arc::new(Schema::new(vec![Field::new( + "hash", + arrow_schema::DataType::FixedSizeList( + Arc::new(Field::new("item", arrow_schema::DataType::UInt8, true)), + 8, + ), + false, + )])); + + // Generate hashes with some duplicates (similar hashes) + let num_rows = 100; + let mut hash_bytes = Vec::with_capacity(num_rows * 8); + for i in 0..num_rows { + // Create groups of similar hashes + let base = (i / 10) as u64; // 10 groups + let variation = (i % 10) as u64; + let hash = base.wrapping_mul(0x123456789) ^ variation; + hash_bytes.extend_from_slice(&hash.to_le_bytes()); + } + let values = UInt8Array::from(hash_bytes); + let hash_array = + FixedSizeListArray::try_new_from_values(values, 8).expect("create hash array"); + + let batch = + arrow_array::RecordBatch::try_new(schema.clone(), vec![Arc::new(hash_array)]).unwrap(); + + // Write dataset + let temp_dir = tempdir().unwrap(); + let uri = temp_dir.path().to_str().unwrap(); + + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + let mut dataset = crate::Dataset::write(reader, uri, None).await.unwrap(); + + // Create IVF_FLAT index with 4 partitions + let ivf_params = IvfBuildParams::new(4); + let params = crate::index::vector::VectorIndexParams::with_ivf_flat_params( + lance_linalg::distance::MetricType::Hamming, + ivf_params, + ); + + dataset + .create_index( + &["hash"], + crate::index::IndexType::Vector, + None, + &params, + false, + ) + .await + .unwrap(); + + // Load and test + let dataset = 
crate::Dataset::open(uri).await.unwrap(); + let indices = dataset.load_indices().await.unwrap(); + let index_name = &indices[0].name; + + // Test clustering on partition 0 + let reader = hamming_clustering_for_ivf_partition(&dataset, index_name, 0, 10) + .await + .unwrap(); + let clusters = collect_clusters(reader); + + // Verify we get valid results (may or may not have clusters depending on data distribution) + // At minimum, verify no panics and valid schema + for (rep, dups) in &clusters { + assert!(*rep < num_rows as u64 * 10); // row IDs should be reasonable + for dup in dups { + assert!(*dup < num_rows as u64 * 10); + } + } + } + + #[tokio::test] + async fn test_hamming_clustering_for_ivf_partition_invalid_index() { + use arrow_array::{FixedSizeListArray, RecordBatchIterator, UInt8Array}; + use arrow_schema::{Field, Schema}; + use lance_arrow::FixedSizeListArrayExt; + use std::sync::Arc; + use tempfile::tempdir; + + let schema = Arc::new(Schema::new(vec![Field::new( + "hash", + arrow_schema::DataType::FixedSizeList( + Arc::new(Field::new("item", arrow_schema::DataType::UInt8, true)), + 8, + ), + false, + )])); + + let values = UInt8Array::from(vec![0u8; 80]); // 10 rows * 8 bytes + let hash_array = FixedSizeListArray::try_new_from_values(values, 8).unwrap(); + let batch = + arrow_array::RecordBatch::try_new(schema.clone(), vec![Arc::new(hash_array)]).unwrap(); + + let temp_dir = tempdir().unwrap(); + let uri = temp_dir.path().to_str().unwrap(); + + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + let dataset = crate::Dataset::write(reader, uri, None).await.unwrap(); + + // Test with non-existent index + let result = hamming_clustering_for_ivf_partition(&dataset, "nonexistent", 0, 10).await; + assert!(result.is_err()); + let err = result.err().unwrap(); + assert!(err.to_string().contains("not found"), "Error: {}", err); + } + + #[tokio::test] + async fn test_hamming_clustering_for_sample_integration() { + use arrow_array::{FixedSizeListArray, 
RecordBatchIterator, UInt8Array}; + use arrow_schema::{Field, Schema}; + use lance_arrow::FixedSizeListArrayExt; + use std::sync::Arc; + use tempfile::tempdir; + + let schema = Arc::new(Schema::new(vec![Field::new( + "hash", + arrow_schema::DataType::FixedSizeList( + Arc::new(Field::new("item", arrow_schema::DataType::UInt8, true)), + 8, + ), + false, + )])); + + // Create 50 rows with some duplicate hashes + let num_rows = 50; + let mut hash_bytes = Vec::with_capacity(num_rows * 8); + for i in 0..num_rows { + // Create some identical hashes (groups of 5) + let hash = (i / 5) as u64; + hash_bytes.extend_from_slice(&hash.to_le_bytes()); + } + let values = UInt8Array::from(hash_bytes); + let hash_array = FixedSizeListArray::try_new_from_values(values, 8).unwrap(); + let batch = + arrow_array::RecordBatch::try_new(schema.clone(), vec![Arc::new(hash_array)]).unwrap(); + + let temp_dir = tempdir().unwrap(); + let uri = temp_dir.path().to_str().unwrap(); + + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + crate::Dataset::write(reader, uri, None).await.unwrap(); + + let dataset = crate::Dataset::open(uri).await.unwrap(); + + // Test full scan (no sampling) + let reader = hamming_clustering_for_sample(&dataset, "hash", None, 0) + .await + .unwrap(); + let clusters = collect_clusters(reader); + + // With threshold 0 (exact match) and groups of 5 identical hashes, + // we should have 10 clusters with 4 duplicates each + assert_eq!(clusters.len(), 10); + for (_, dups) in &clusters { + assert_eq!(dups.len(), 4); + } + + // Test with sampling + let reader = hamming_clustering_for_sample(&dataset, "hash", Some(20), 0) + .await + .unwrap(); + let clusters = collect_clusters(reader); + // With sampling, we may get fewer clusters + assert!(clusters.len() <= 10); + } + + #[tokio::test] + async fn test_hamming_clustering_for_range_integration() { + use arrow_array::{FixedSizeListArray, RecordBatchIterator, UInt8Array}; + use arrow_schema::{Field, Schema}; + use 
lance_arrow::FixedSizeListArrayExt; + use std::sync::Arc; + use tempfile::tempdir; + + let schema = Arc::new(Schema::new(vec![Field::new( + "hash", + arrow_schema::DataType::FixedSizeList( + Arc::new(Field::new("item", arrow_schema::DataType::UInt8, true)), + 8, + ), + false, + )])); + + // Create 50 rows with some duplicate hashes (groups of 5 identical hashes) + let num_rows = 50; + let mut hash_bytes = Vec::with_capacity(num_rows * 8); + for i in 0..num_rows { + let hash = (i / 5) as u64; + hash_bytes.extend_from_slice(&hash.to_le_bytes()); + } + let values = UInt8Array::from(hash_bytes); + let hash_array = FixedSizeListArray::try_new_from_values(values, 8).unwrap(); + let batch = + arrow_array::RecordBatch::try_new(schema.clone(), vec![Arc::new(hash_array)]).unwrap(); + + let temp_dir = tempdir().unwrap(); + let uri = temp_dir.path().to_str().unwrap(); + + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + crate::Dataset::write(reader, uri, None).await.unwrap(); + + let dataset = crate::Dataset::open(uri).await.unwrap(); + + // Get fragment info + let fragments = dataset.get_fragments(); + assert_eq!(fragments.len(), 1); + let fragment_id = fragments[0].id() as usize; + + // Test reading range from the fragment + // Reading rows 0-25 should cover groups 0-4 (5 groups, each with 5 rows) + let reader = hamming_clustering_for_range(&dataset, "hash", fragment_id, 0, 25, 0) + .await + .unwrap(); + let clusters = collect_clusters(reader); + + // With threshold 0 and 25 rows (groups 0-4), we should have 5 clusters + // Each cluster has 4 duplicates (5 identical hashes - 1 representative = 4 duplicates) + assert_eq!(clusters.len(), 5); + for (_, dups) in &clusters { + assert_eq!(dups.len(), 4); + } + + // Test reading a different range (rows 25-50) + let reader = hamming_clustering_for_range(&dataset, "hash", fragment_id, 25, 25, 0) + .await + .unwrap(); + let clusters = collect_clusters(reader); + + // Should have 5 clusters (groups 5-9) + 
assert_eq!(clusters.len(), 5); + for (_, dups) in &clusters { + assert_eq!(dups.len(), 4); + } + } + + #[tokio::test] + async fn test_hamming_clustering_for_range_invalid_fragment() { + use arrow_array::{FixedSizeListArray, RecordBatchIterator, UInt8Array}; + use arrow_schema::{Field, Schema}; + use lance_arrow::FixedSizeListArrayExt; + use std::sync::Arc; + use tempfile::tempdir; + + let schema = Arc::new(Schema::new(vec![Field::new( + "hash", + arrow_schema::DataType::FixedSizeList( + Arc::new(Field::new("item", arrow_schema::DataType::UInt8, true)), + 8, + ), + false, + )])); + + let values = UInt8Array::from(vec![0u8; 80]); // 10 rows * 8 bytes + let hash_array = FixedSizeListArray::try_new_from_values(values, 8).unwrap(); + let batch = + arrow_array::RecordBatch::try_new(schema.clone(), vec![Arc::new(hash_array)]).unwrap(); + + let temp_dir = tempdir().unwrap(); + let uri = temp_dir.path().to_str().unwrap(); + + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + crate::Dataset::write(reader, uri, None).await.unwrap(); + + let dataset = crate::Dataset::open(uri).await.unwrap(); + + // Test with non-existent fragment + let result = hamming_clustering_for_range(&dataset, "hash", 999, 0, 10, 0).await; + assert!(result.is_err()); + let err = result.err().unwrap(); + assert!(err.to_string().contains("not found"), "Error: {}", err); + + // Test with out-of-range start_row + let result = hamming_clustering_for_range(&dataset, "hash", 0, 1000, 10, 0).await; + assert!(result.is_err()); + let err = result.err().unwrap(); + assert!(err.to_string().contains("out of range"), "Error: {}", err); + } +} From 0fd0c37d5ec2be33e15b5ee6386564ac1603092a Mon Sep 17 00:00:00 2001 From: YueZhang <69956021+zhangyue19921010@users.noreply.github.com> Date: Mon, 22 Jun 2026 16:54:51 +0800 Subject: [PATCH 156/177] fix(ci): replace deprecated array.shape assignment for NumPy 2.5 (#7384) Unblock CI ``` DeprecationWarning: Setting the shape on a NumPy array has been deprecated 
in NumPy 2.5. As an alternative, you can create a new view using np.reshape (with copy=False if needed). ``` --- python/python/lance/indices/builder.py | 2 +- python/python/tests/test_indices.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/python/lance/indices/builder.py b/python/python/lance/indices/builder.py index d3d61c5f8ff..6059166d6ba 100644 --- a/python/python/lance/indices/builder.py +++ b/python/python/lance/indices/builder.py @@ -150,7 +150,7 @@ def train_ivf( max_iters=max_iters, ) num_dims = ivf_centroids.shape[1] - ivf_centroids.shape = -1 + ivf_centroids = ivf_centroids.reshape(-1) flat_centroids_array = pa.array(ivf_centroids) centroids_array = pa.FixedSizeListArray.from_arrays( flat_centroids_array, num_dims diff --git a/python/python/tests/test_indices.py b/python/python/tests/test_indices.py index 7f6595f2ecc..02cf64541d6 100644 --- a/python/python/tests/test_indices.py +++ b/python/python/tests/test_indices.py @@ -25,7 +25,7 @@ def make_ds(num_rows: int, rows_per_frag: int, tmpdir: pathlib.Path, dtype: str): vectors = np.random.randn(num_rows, DIMENSION).astype(dtype) - vectors.shape = -1 + vectors = vectors.reshape(-1) vectors = pa.FixedSizeListArray.from_arrays(vectors, DIMENSION) table = pa.Table.from_arrays([vectors], names=["vectors"]) uri = str(tmpdir / "dataset") @@ -53,7 +53,7 @@ def small_rand_dataset(tmpdir, request): @pytest.fixture def mostly_null_dataset(tmpdir, request): vectors = np.random.randn(NUM_ROWS, DIMENSION).astype(np.float32) - vectors.shape = -1 + vectors = vectors.reshape(-1) vectors = pa.FixedSizeListArray.from_arrays(vectors, DIMENSION) vectors = vectors.to_pylist() vectors = [vec if i % 10 == 0 else None for i, vec in enumerate(vectors)] @@ -219,7 +219,7 @@ def test_ivf_centroids_fragment_ids(tmpdir): ], axis=0, ) - vectors.shape = -1 + vectors = vectors.reshape(-1) table = pa.Table.from_arrays( [pa.FixedSizeListArray.from_arrays(vectors, DIMENSION)], names=["vectors"] ) From 
9152d61d14259e8d656a7f1b7d0a64a680fefeb7 Mon Sep 17 00:00:00 2001 From: LeoReeYang <58654486+LeoReeYang@users.noreply.github.com> Date: Mon, 22 Jun 2026 20:15:53 +0800 Subject: [PATCH 157/177] perf(knn): reduce memory for batch flat vector search (#6950) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Closes #6940. Batch flat KNN previously kept a full scan `RecordBatch` (including the vector column) for each top-k candidate, which could use on the order of `m × k × d × 4` bytes when many query vectors and large `k`/`d` are used. This PR changes batch flat KNN so that: - If the **vector column is not in the final projection**, the top-k heap stores only row identifiers and distances (not whole batches or vector data). - If the **vector column is projected**, each candidate keeps a single-row vector plus a shared scan batch **without** the vector column (for any other columns needed on output). - If the flat scan already reads **non-vector columns** (for example columns referenced by a filter), only the vector column is dropped from what we retain; other columns can still be assembled in the KNN node without keeping vectors in the heap. ## Behavior - [`Scanner::flat_knn`](rust/lance/src/dataset/scanner.rs) sets `retain_vector` from whether the user projection includes the vector field. - [`KNNVectorDistanceExec`](rust/lance/src/io/exec/knn.rs) batch mode maintains a **per-query** top-k heap across all scan batches and emits one result batch with `query_index` and `_distance`. - Indexed batch KNN is unchanged; this targets **flat** batch vector search only. 
## Memory (qualitative) | Projection / scan input | What top-k heap retains | |-------------------------|-------------------------| | No vector in projection; scan is `vec + _rowid` | Row id + distance only | | Vector in projection | Shared batch without vector column + one row of vector per candidate | | No vector in projection; scan includes other columns | Shared batch without vector column (no vector rows stored) | ## Test plan - [x] `cargo test -p lance --lib test_batch_knn_flat` - [x] `cargo test -p lance --lib test_batch_knn_flat_omits_vector_without_projection` - [x] `cargo test -p lance --lib test_batch_knn_flat_filter_keeps_non_vector_columns` - [x] `cargo test -p lance --lib test_batch_knn_flat_respects_distance_range` - [x] `cargo fmt --all` and `cargo clippy -p lance --lib -- -D warnings` --------- Co-authored-by: BubbleCal --- rust/lance/src/dataset/scanner.rs | 323 +++++++++++++ rust/lance/src/io/exec/knn.rs | 746 ++++++++++++++++++++++++++++-- 2 files changed, 1018 insertions(+), 51 deletions(-) diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index 09cd7023e74..d4b58e4783f 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -4481,6 +4481,14 @@ impl Scanner { } else { input }; + let retain_vector = if self.is_batch_nearest { + let vector_field_id = self.dataset.schema().field_id(q.column.as_str())?; + self.projection_plan + .physical_projection + .contains_field_id(vector_field_id) + } else { + false + }; let flat_dist = Arc::new(KNNVectorDistanceExec::try_new_batch( input, &q.column, @@ -4492,6 +4500,7 @@ impl Scanner { lower_bound: q.lower_bound, upper_bound: q.upper_bound, distance_type: metric_type, + retain_vector, }, )?); @@ -5942,6 +5951,114 @@ mod test { (queries, query_values) } + async fn nested_vector_test_dataset(dim: u32) -> (TempStrDir, Dataset) { + let path = TempStrDir::default(); + let vec_field = ArrowField::new( + "vec", + DataType::FixedSizeList( + 
Arc::new(ArrowField::new("item", DataType::Float32, true)), + dim as i32, + ), + true, + ); + let payload_field = ArrowField::new( + "payload", + DataType::Struct(vec![vec_field.clone()].into()), + true, + ); + let schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("i", DataType::Int32, true), + payload_field.clone(), + ])); + + let batches: Vec = (0..5) + .map(|batch_idx| { + let vector_values: Float32Array = (0..dim * 80).map(|v| v as f32).collect(); + let vectors = + FixedSizeListArray::try_new_from_values(vector_values, dim as i32).unwrap(); + let payload = StructArray::from(vec![( + Arc::new(vec_field.clone()), + Arc::new(vectors) as ArrayRef, + )]); + RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from_iter_values( + batch_idx * 80..(batch_idx + 1) * 80, + )), + Arc::new(payload), + ], + ) + .unwrap() + }) + .collect(); + + let params = WriteParams { + max_rows_per_group: 10, + max_rows_per_file: 200, + data_storage_version: Some(LanceFileVersion::Stable), + enable_stable_row_ids: true, + ..Default::default() + }; + let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema); + let dataset = Dataset::write(reader, &path, Some(params)).await.unwrap(); + (path, dataset) + } + + async fn escaped_nested_vector_test_dataset(dim: u32) -> (TempStrDir, Dataset) { + let path = TempStrDir::default(); + let vec_field = ArrowField::new( + "vec.with.dot", + DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float32, true)), + dim as i32, + ), + true, + ); + let payload_field = ArrowField::new( + "payload", + DataType::Struct(vec![vec_field.clone()].into()), + true, + ); + let schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("i", DataType::Int32, true), + payload_field.clone(), + ])); + + let batches: Vec = (0..5) + .map(|batch_idx| { + let vector_values: Float32Array = (0..dim * 80).map(|v| v as f32).collect(); + let vectors = + FixedSizeListArray::try_new_from_values(vector_values, dim as 
i32).unwrap(); + let payload = StructArray::from(vec![( + Arc::new(vec_field.clone()), + Arc::new(vectors) as ArrayRef, + )]); + RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from_iter_values( + batch_idx * 80..(batch_idx + 1) * 80, + )), + Arc::new(payload), + ], + ) + .unwrap() + }) + .collect(); + + let params = WriteParams { + max_rows_per_group: 10, + max_rows_per_file: 200, + data_storage_version: Some(LanceFileVersion::Stable), + enable_stable_row_ids: true, + ..Default::default() + }; + let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema); + let dataset = Dataset::write(reader, &path, Some(params)).await.unwrap(); + (path, dataset) + } + fn assert_query_index_field(batch: &RecordBatch) { let schema = batch.schema(); let field = schema.field(0); @@ -5950,6 +6067,14 @@ mod test { assert!(!field.is_nullable()); } + fn assert_batch_knn_output_has_no_vector(batch: &RecordBatch, vector_column: &str) { + assert!( + batch.schema().column_with_name(vector_column).is_none(), + "batch flat KNN output must not include vector column '{vector_column}' when it is not projected; columns: {:?}", + batch.schema().field_names() + ); + } + async fn assert_batch_matches_single_queries( dataset: &Dataset, batch: &RecordBatch, @@ -6024,6 +6149,7 @@ mod test { let batch = scan.try_into_batch().await.unwrap(); assert_query_index_field(&batch); + assert_batch_knn_output_has_no_vector(&batch, "vec"); assert_eq!( batch.num_rows(), 2 * k, @@ -6046,6 +6172,25 @@ mod test { } assert_batch_matches_single_queries(dataset, &batch, &query_values, k, false, None).await; + let mut scan_with_vec = dataset.scan(); + scan_with_vec.nearest("vec", &queries, k).unwrap(); + scan_with_vec.use_index(false); + scan_with_vec.project(&["i", "vec"]).unwrap(); + let batch_with_vec = scan_with_vec.try_into_batch().await.unwrap(); + assert!( + batch_with_vec.schema().column_with_name("vec").is_some(), + "batch flat KNN should return vector column when projected" 
+ ); + assert_batch_matches_single_queries( + dataset, + &batch_with_vec, + &query_values, + k, + false, + None, + ) + .await; + let query_values_one = (32..64).map(|v| v as f32).collect::>(); let queries_one = FixedSizeListArray::try_new_from_values( Float32Array::from(query_values_one.clone()), @@ -6071,12 +6216,190 @@ mod test { let batch = scan.try_into_batch().await.unwrap(); assert_query_index_field(&batch); + assert_batch_knn_output_has_no_vector(&batch, "vec"); assert_eq!( batch[QUERY_INDEX_COL].as_primitive::().values(), &[0, 0] ); } + #[tokio::test] + async fn test_batch_knn_flat_omits_vector_without_projection() { + let test_ds = TestVectorDataset::new(LanceFileVersion::Stable, true) + .await + .unwrap(); + let dataset = &test_ds.dataset; + let k = 2; + let (queries, query_values) = batch_knn_two_queries(); + + let mut scan = dataset.scan(); + scan.nearest("vec", &queries, k).unwrap(); + scan.use_index(false); + scan.project(&["i"]).unwrap(); + let batch = scan.try_into_batch().await.unwrap(); + assert_batch_knn_output_has_no_vector(&batch, "vec"); + assert_query_index_field(&batch); + assert!(batch.schema().column_with_name("i").is_some()); + assert!(batch.schema().column_with_name(DIST_COL).is_some()); + assert_batch_matches_single_queries(dataset, &batch, &query_values, k, false, None).await; + + let mut scan_rowid_only = dataset.scan(); + scan_rowid_only.nearest("vec", &queries, k).unwrap(); + scan_rowid_only.use_index(false); + scan_rowid_only.project(&[ROW_ID]).unwrap(); + let batch_rowid_only = scan_rowid_only.try_into_batch().await.unwrap(); + assert_batch_knn_output_has_no_vector(&batch_rowid_only, "vec"); + assert!(batch_rowid_only.schema().column_with_name(ROW_ID).is_some()); + assert!(batch_rowid_only.schema().column_with_name("i").is_none()); + + let mut scan_with_vec = dataset.scan(); + scan_with_vec.nearest("vec", &queries, k).unwrap(); + scan_with_vec.use_index(false); + scan_with_vec.project(&["vec"]).unwrap(); + let batch_with_vec = 
scan_with_vec.try_into_batch().await.unwrap(); + assert!( + batch_with_vec.schema().column_with_name("vec").is_some(), + "batch flat KNN must include vector column when vec is projected" + ); + } + + #[tokio::test] + async fn test_batch_knn_flat_filter_keeps_non_vector_columns() { + let test_ds = TestVectorDataset::new(LanceFileVersion::Stable, true) + .await + .unwrap(); + let dataset = &test_ds.dataset; + let k = 2; + let (queries, query_values) = batch_knn_two_queries(); + + let mut scan = dataset.scan(); + scan.nearest("vec", &queries, k).unwrap(); + scan.use_index(false); + scan.filter("i >= 0").unwrap(); + scan.project(&["i"]).unwrap(); + let batch = scan.try_into_batch().await.unwrap(); + + assert_query_index_field(&batch); + assert_batch_knn_output_has_no_vector(&batch, "vec"); + assert!(batch.schema().column_with_name("i").is_some()); + + let query_indices = batch[QUERY_INDEX_COL].as_primitive::(); + for query_index in 0..2 { + let query = + Float32Array::from(query_values[query_index * 32..(query_index + 1) * 32].to_vec()); + let mut single = dataset.scan(); + single.nearest("vec", &query, k).unwrap(); + single.use_index(false); + single.filter("i >= 0").unwrap(); + single.project(&["i"]).unwrap(); + let single_batch = single.try_into_batch().await.unwrap(); + + let mask = BooleanArray::from_iter( + query_indices + .iter() + .map(|value| value.map(|value| value == query_index as i32)), + ); + let batch_slice = arrow::compute::filter_record_batch(&batch, &mask).unwrap(); + assert_eq!( + batch_slice["i"].as_primitive::().values(), + single_batch["i"].as_primitive::().values() + ); + } + } + + #[tokio::test] + async fn test_batch_knn_flat_nested_vector_projection() { + const VECTOR_COLUMN: &str = "payload.vec"; + let (_tmp, dataset) = nested_vector_test_dataset(32).await; + let k = 2; + let (queries, _query_values) = batch_knn_two_queries(); + + let mut scan = dataset.scan(); + scan.nearest(VECTOR_COLUMN, &queries, k).unwrap(); + scan.use_index(false); + 
scan.project(&["i"]).unwrap(); + let batch = scan.try_into_batch().await.unwrap(); + assert_query_index_field(&batch); + assert_batch_knn_output_has_no_vector(&batch, VECTOR_COLUMN); + assert_eq!(batch.num_rows(), 2 * k); + assert!(batch.schema().column_with_name("i").is_some()); + + let mut scan_with_vec = dataset.scan(); + scan_with_vec.nearest(VECTOR_COLUMN, &queries, k).unwrap(); + scan_with_vec.use_index(false); + scan_with_vec.project(&[VECTOR_COLUMN]).unwrap(); + let batch_with_vec = scan_with_vec.try_into_batch().await.unwrap(); + assert!( + batch_with_vec + .schema() + .column_with_name(VECTOR_COLUMN) + .is_some(), + "batch flat KNN must include nested vector column when projected; columns: {:?}", + batch_with_vec.schema().field_names() + ); + } + + #[tokio::test] + async fn test_batch_knn_flat_escaped_nested_vector_projection() { + const VECTOR_COLUMN: &str = "payload.`vec.with.dot`"; + let (_tmp, dataset) = escaped_nested_vector_test_dataset(32).await; + let k = 2; + let (queries, _) = batch_knn_two_queries(); + + let mut scan = dataset.scan(); + scan.nearest(VECTOR_COLUMN, &queries, k).unwrap(); + scan.use_index(false); + scan.project(&["i"]).unwrap(); + let batch = scan.try_into_batch().await.unwrap(); + assert_query_index_field(&batch); + assert_batch_knn_output_has_no_vector(&batch, VECTOR_COLUMN); + assert_eq!(batch.num_rows(), 2 * k); + assert!(batch.schema().column_with_name("i").is_some()); + + let mut scan_with_vec = dataset.scan(); + scan_with_vec.nearest(VECTOR_COLUMN, &queries, k).unwrap(); + scan_with_vec.use_index(false); + scan_with_vec.project(&[VECTOR_COLUMN]).unwrap(); + let batch_with_vec = scan_with_vec.try_into_batch().await.unwrap(); + assert!( + batch_with_vec + .schema() + .column_with_name(VECTOR_COLUMN) + .is_some(), + "batch flat KNN must include escaped nested vector column when projected; columns: {:?}", + batch_with_vec.schema().field_names() + ); + } + + #[tokio::test] + async fn 
test_batch_knn_flat_projects_row_id_and_row_addr_without_vector() { + let test_ds = TestVectorDataset::new(LanceFileVersion::Stable, true) + .await + .unwrap(); + let dataset = &test_ds.dataset; + let k = 2; + let (queries, _) = batch_knn_two_queries(); + + let mut scan = dataset.scan(); + scan.nearest("vec", &queries, k).unwrap(); + scan.use_index(false); + scan.project(&[ROW_ID]).unwrap(); + scan.with_row_address(); + + let batch = scan.try_into_batch().await.unwrap(); + assert_query_index_field(&batch); + assert_batch_knn_output_has_no_vector(&batch, "vec"); + assert_eq!(batch.num_rows(), 2 * k); + assert!(batch.schema().column_with_name(ROW_ID).is_some()); + assert!(batch.schema().column_with_name(ROW_ADDR).is_some()); + assert!(batch.schema().column_with_name(DIST_COL).is_some()); + assert_eq!( + batch[ROW_ADDR].as_primitive::().null_count(), + 0, + "row addresses should be materialized for all top-k rows" + ); + } + #[tokio::test] async fn test_primitive_query_length_multiple_of_dim_is_rejected() { let test_ds = TestVectorDataset::new(LanceFileVersion::Stable, true) diff --git a/rust/lance/src/io/exec/knn.rs b/rust/lance/src/io/exec/knn.rs index 0ceddf7c5ee..73e901aee04 100644 --- a/rust/lance/src/io/exec/knn.rs +++ b/rust/lance/src/io/exec/knn.rs @@ -17,7 +17,6 @@ use arrow_array::{ cast::AsArray, }; use arrow_schema::{DataType, Field, Schema, SchemaRef}; -use arrow_select::concat::concat_batches; use datafusion::physical_plan::PlanProperties; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::physical_plan::{ @@ -157,6 +156,7 @@ pub struct KNNVectorDistanceExec { pub upper_bound: Option, pub column: String, pub distance_type: DistanceType, + retain_vector: bool, input_schema: SchemaRef, output_schema: SchemaRef, @@ -172,10 +172,11 @@ pub struct KnnBatchParams { pub lower_bound: Option, pub upper_bound: Option, pub distance_type: DistanceType, + pub retain_vector: bool, } struct BatchKnnConfig { - input_schema: SchemaRef, + 
stored_schema: SchemaRef, output_schema: SchemaRef, column: String, query: ArrayRef, @@ -184,6 +185,7 @@ struct BatchKnnConfig { lower_bound: Option, upper_bound: Option, distance_type: DistanceType, + retain_vector: bool, } impl DisplayAs for KNNVectorDistanceExec { @@ -216,6 +218,120 @@ impl DisplayAs for KNNVectorDistanceExec { } impl KNNVectorDistanceExec { + fn remove_field_path_from_fields( + fields: &[Arc], + path: &[String], + ) -> DataFusionResult>> { + if path.is_empty() { + return Ok(fields.to_vec()); + } + let mut removed = false; + let mut new_fields = Vec::with_capacity(fields.len()); + for field in fields { + if field.name() != &path[0] { + new_fields.push(field.clone()); + continue; + } + removed = true; + if path.len() == 1 { + continue; + } + match field.data_type() { + DataType::Struct(children) => { + let child_fields = children.iter().cloned().collect::>(); + let projected_children = + Self::remove_field_path_from_fields(&child_fields, &path[1..])?; + if projected_children.is_empty() { + continue; + } + let updated = Field::new( + field.name(), + DataType::Struct(projected_children.into()), + field.is_nullable(), + ) + .with_metadata(field.metadata().clone()); + new_fields.push(Arc::new(updated)); + } + _ => { + return Err(DataFusionError::Internal(format!( + "batch KNN cannot remove nested path '{}': '{}' is not a struct", + path.join("."), + field.name() + ))); + } + } + } + if !removed { + return Err(DataFusionError::Internal(format!( + "batch KNN expected vector column '{}' in scan batch schema", + path.join(".") + ))); + } + Ok(new_fields) + } + + fn remove_vector_from_schema(schema: &Schema, column: &str) -> DataFusionResult { + let path = lance_core::datatypes::parse_field_path(column).map_err(|err| { + DataFusionError::Internal(format!( + "batch KNN failed to parse vector column path '{column}': {err}" + )) + })?; + let fields = schema.fields().iter().cloned().collect::>(); + let updated_fields = 
Self::remove_field_path_from_fields(&fields, &path)?; + Ok(Schema::new_with_metadata( + updated_fields, + schema.metadata().clone(), + )) + } + + fn remove_vector_from_batch( + batch: &RecordBatch, + column: &str, + ) -> DataFusionResult { + let slim_schema = Self::remove_vector_from_schema(batch.schema().as_ref(), column)?; + batch + .project_by_schema(&slim_schema) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None)) + } + + fn resolve_vector_column(batch: &RecordBatch, column: &str) -> DataFusionResult { + if let Some(col) = batch.column_by_name(column) { + return Ok(col.clone()); + } + let parts = lance_core::datatypes::parse_field_path(column).map_err(|e| { + DataFusionError::Internal(format!( + "batch KNN failed to parse vector column path '{column}': {e}" + )) + })?; + if parts.is_empty() { + return Err(DataFusionError::Internal(format!( + "batch KNN has invalid empty vector column path '{column}'" + ))); + } + let mut current = batch.column_by_name(&parts[0]).cloned().ok_or_else(|| { + DataFusionError::Internal(format!( + "batch KNN expected vector column '{column}' in scan batch (missing root field '{}')", + parts[0] + )) + })?; + for part in &parts[1..] { + let struct_array = current + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal(format!( + "batch KNN expected struct while resolving '{column}', but parent of '{part}' was not a struct" + )) + })?; + current = struct_array.column_by_name(part).cloned().ok_or_else(|| { + DataFusionError::Internal(format!( + "batch KNN expected vector column '{column}' in scan batch (missing nested field '{part}')" + )) + })?; + } + Ok(current) + } + /// Create a new [`KNNVectorDistanceExec`] node. /// /// Returns an error if the preconditions are not met. 
@@ -236,6 +352,7 @@ impl KNNVectorDistanceExec { lower_bound: None, upper_bound: None, distance_type, + retain_vector: false, }, ) } @@ -253,6 +370,7 @@ impl KNNVectorDistanceExec { lower_bound, upper_bound, distance_type, + retain_vector, } = params; if query_count == 0 { return Err(Error::invalid_input( @@ -287,13 +405,19 @@ impl KNNVectorDistanceExec { "batch KNN cannot run when the input already contains reserved column '{QUERY_INDEX_COL}'" ))); } - let input_schema = Arc::new(input_schema); + + let stored_schema = if is_batch && !retain_vector { + Arc::new(Self::remove_vector_from_schema(&input_schema, column)?) + } else { + Arc::new(input_schema) + }; + let output_schema = if is_batch { - input_schema + stored_schema .as_ref() .try_with_column_at(0, query_index_field())? } else { - input_schema.as_ref().clone() + stored_schema.as_ref().clone() }; let output_schema = Arc::new(output_schema.try_with_column(Field::new( DIST_COL, @@ -330,19 +454,230 @@ impl KNNVectorDistanceExec { upper_bound, column: column.to_string(), distance_type, - input_schema, + retain_vector, + input_schema: stored_schema, output_schema, properties, metrics: ExecutionPlanMetricsSet::new(), }) } + fn take_vector_row(vectors: &dyn Array, row_index: u32) -> DataFusionResult { + let indices = UInt32Array::from_iter([Some(row_index)]); + arrow_select::take::take(vectors, &indices, None) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None)) + } + + fn take_slim_batch_field( + results: &[BatchKnnCandidate], + field_name: &str, + ) -> DataFusionResult { + Self::take_slim_batch_field_if_present(results, field_name)?.ok_or_else(|| { + DataFusionError::Internal(format!("column '{field_name}' missing from slim batch")) + }) + } + + fn take_slim_batch_field_if_present( + results: &[BatchKnnCandidate], + field_name: &str, + ) -> DataFusionResult> { + use std::collections::HashMap; + + type SlimBatchGroup = (Arc, Vec<(usize, u32)>); + let mut groups: HashMap<*const RecordBatch, SlimBatchGroup> 
= HashMap::new(); + for (result_index, candidate) in results.iter().enumerate() { + let BatchKnnExtra::WithSlimBatch { + slim_batch, + row_index, + .. + } = &candidate.extra + else { + return Err(DataFusionError::Internal( + "batch KNN expected slim batch in candidate heap".to_string(), + )); + }; + groups + .entry(Arc::as_ptr(slim_batch)) + .or_insert_with(|| (Arc::clone(slim_batch), Vec::new())) + .1 + .push((result_index, *row_index)); + } + + let mut ordered: Vec> = vec![None; results.len()]; + for (_, (slim_batch, entries)) in groups { + let indices = + UInt32Array::from_iter(entries.iter().map(|(_, row_index)| Some(*row_index))); + let taken = arrow_select::take::take_record_batch(slim_batch.as_ref(), &indices) + .map_err(|e| { + DataFusionError::ArrowError(Box::new(e), Some("take top-k rows".to_string())) + })?; + let Some(column) = taken.column_by_name(field_name) else { + continue; + }; + for (offset, (result_index, _)) in entries.iter().enumerate() { + ordered[*result_index] = Some(column.slice(offset, 1)); + } + } + if ordered.iter().all(Option::is_none) { + return Ok(None); + } + if ordered.iter().any(Option::is_none) { + return Err(DataFusionError::Internal(format!( + "column '{field_name}' inconsistently present in slim batches" + ))); + } + + let row_arrays: Vec<&dyn Array> = ordered + .iter() + .map(|array| { + array + .as_ref() + .expect("every result mapped from slim batch") + .as_ref() + }) + .collect(); + arrow::compute::concat(&row_arrays) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None)) + .map(Some) + } + + fn build_struct_column_for_path( + field: &Field, + path: &[String], + leaf_column: ArrayRef, + slim_column: Option<&dyn Array>, + ) -> DataFusionResult { + if path.is_empty() { + return Ok(leaf_column); + } + let DataType::Struct(children) = field.data_type() else { + return Err(DataFusionError::Internal(format!( + "batch KNN expected struct field '{}' while rebuilding nested vector path '{}'", + field.name(), + 
path.join(".") + ))); + }; + let slim_struct = slim_column + .map(|column| { + column + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal(format!( + "batch KNN expected slim column '{}' to be a struct while rebuilding nested vector path '{}'", + field.name(), + path.join(".") + )) + }) + }) + .transpose()?; + let mut columns = Vec::with_capacity(children.len()); + for child in children.iter() { + if child.name() == &path[0] { + if path.len() == 1 { + columns.push(leaf_column.clone()); + } else { + let child_slim_column = slim_struct + .and_then(|struct_array| struct_array.column_by_name(child.name())); + columns.push(Self::build_struct_column_for_path( + child, + &path[1..], + leaf_column.clone(), + child_slim_column.map(|column| column.as_ref()), + )?); + } + } else if let Some(column) = + slim_struct.and_then(|struct_array| struct_array.column_by_name(child.name())) + { + columns.push(column.clone()); + } else { + columns.push(arrow_array::new_null_array( + child.data_type(), + leaf_column.len(), + )); + } + } + let struct_array = arrow_array::StructArray::try_new( + children.clone(), + columns, + slim_struct.and_then(|struct_array| struct_array.nulls().cloned()), + ) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; + Ok(Arc::new(struct_array)) + } + + fn take_retained_vector_column( + results: &[BatchKnnCandidate], + field: &Field, + field_path: &[String], + ) -> DataFusionResult { + let vector_rows: Vec<&dyn Array> = results + .iter() + .map(|candidate| { + let BatchKnnExtra::WithSlimBatch { + vector_row: Some(vector_row), + .. 
+ } = &candidate.extra + else { + return Err(DataFusionError::Internal( + "batch KNN expected vector rows in candidate heap".to_string(), + )); + }; + Ok(vector_row.as_ref()) + }) + .collect::>>()?; + let leaf_column = arrow::compute::concat(&vector_rows) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; + if field_path.len() <= 1 { + Ok(leaf_column) + } else { + let slim_column = Self::take_slim_batch_field_if_present(results, field.name())?; + Self::build_struct_column_for_path( + field, + &field_path[1..], + leaf_column, + slim_column.as_deref(), + ) + } + } + + fn assemble_batch_output( + results: &[BatchKnnCandidate], + stored_schema: &Schema, + column: &str, + retain_vector: bool, + ) -> DataFusionResult { + let field_path = lance_core::datatypes::parse_field_path(column).map_err(|e| { + DataFusionError::Internal(format!( + "batch KNN failed to parse vector column path '{column}': {e}" + )) + })?; + let mut columns: Vec = Vec::with_capacity(stored_schema.fields().len()); + for field in stored_schema.fields() { + if field.name() == ROW_ID { + let row_ids = + UInt64Array::from_iter(results.iter().map(|candidate| Some(candidate.row_id))); + columns.push(Arc::new(row_ids)); + } else if retain_vector && !field_path.is_empty() && field.name() == &field_path[0] { + columns.push(Self::take_retained_vector_column( + results, + field, + &field_path, + )?); + } else { + columns.push(Self::take_slim_batch_field(results, field.name())?); + } + } + RecordBatch::try_new(Arc::new(stored_schema.clone()), columns) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None)) + } + async fn execute_batch( input: SendableRecordBatchStream, config: BatchKnnConfig, ) -> DataFusionResult { let BatchKnnConfig { - input_schema, + stored_schema, output_schema, column, query, @@ -351,8 +686,10 @@ impl KNNVectorDistanceExec { lower_bound, upper_bound, distance_type, + retain_vector, } = config; let query_dim = query.len() / query_count; + let needs_slim_batch = 
stored_schema.fields().iter().any(|f| f.name() != ROW_ID); let mut heaps = (0..query_count) .map(|_| BinaryHeap::::with_capacity(k)) .collect::>(); @@ -374,6 +711,13 @@ impl KNNVectorDistanceExec { .as_primitive::() .clone(); + let mut slim_batch: Option> = None; + let vectors = if retain_vector { + Some(Self::resolve_vector_column(&batch, &column)?) + } else { + None + }; + for (query_index, heap) in heaps.iter_mut().enumerate().take(query_count) { let key = query.slice(query_index * query_dim, query_dim); let with_distances = compute_distance(key, distance_type, &column, batch.clone()) @@ -398,20 +742,42 @@ impl KNNVectorDistanceExec { } let query_index = query_index as i32; let row_id = row_ids.value(row_index); - let row_index = row_index as u32; + if !would_enter_heap(heap, k, distance, row_id, query_index) { + continue; + } + + let extra = if retain_vector || needs_slim_batch { + let row_index = row_index as u32; + if slim_batch.is_none() { + let slim = Self::remove_vector_from_batch(&batch, &column)?; + slim_batch = Some(Arc::new(slim)); + } + let slim_batch = slim_batch.as_ref().expect("slim batch"); + let vector_row = if retain_vector { + Some(Self::take_vector_row( + vectors.as_ref().expect("vectors"), + row_index, + )?) 
+ } else { + None + }; + BatchKnnExtra::WithSlimBatch { + slim_batch: Arc::clone(slim_batch), + row_index, + vector_row, + } + } else { + BatchKnnExtra::RowIdOnly + }; let candidate = BatchKnnCandidate { query_index, distance, row_id, - batch: batch.clone(), - row_index, + extra, }; if heap.len() < k { heap.push(candidate); - } else if heap - .peek() - .is_some_and(|worst| candidate.cmp(worst).is_lt()) - { + } else { heap.pop(); heap.push(candidate); } @@ -436,20 +802,14 @@ impl KNNVectorDistanceExec { let mut query_indices = Int32Builder::with_capacity(results.len()); let mut distances = Float32Builder::with_capacity(results.len()); - let mut row_batches = Vec::with_capacity(results.len()); - for result in results { + for result in &results { query_indices.append_value(result.query_index); distances.append_value(result.distance); - let indices = UInt32Array::from(vec![result.row_index]); - row_batches.push( - arrow_select::take::take_record_batch(&result.batch, &indices).map_err(|e| { - DataFusionError::ArrowError(Box::new(e), Some("take top-k row".to_string())) - })?, - ); } - let output = concat_batches(&input_schema, &row_batches) - .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; + let output = + Self::assemble_batch_output(&results, stored_schema.as_ref(), &column, retain_vector)?; + output .try_with_column_at(0, query_index_field(), Arc::new(query_indices.finish())) .and_then(|batch| { @@ -501,6 +861,7 @@ impl ExecutionPlan for KNNVectorDistanceExec { lower_bound: self.lower_bound, upper_bound: self.upper_bound, distance_type: self.distance_type, + retain_vector: self.retain_vector, }, )?)) } @@ -515,7 +876,7 @@ impl ExecutionPlan for KNNVectorDistanceExec { let stream = stream::once(Self::execute_batch( input_stream, BatchKnnConfig { - input_schema: self.input_schema.clone(), + stored_schema: self.input_schema.clone(), output_schema: self.output_schema.clone(), column: self.column.clone(), query: self.query.clone(), @@ -524,6 +885,7 @@ impl 
ExecutionPlan for KNNVectorDistanceExec { lower_bound: self.lower_bound, upper_bound: self.upper_bound, distance_type: self.distance_type, + retain_vector: self.retain_vector, }, )); let schema = self.schema(); @@ -586,35 +948,41 @@ impl ExecutionPlan for KNNVectorDistanceExec { fn partition_statistics(&self, partition: Option) -> DataFusionResult { let inner_stats = self.input.partition_statistics(partition)?; - let schema = self.input.schema(); - let dist_stats = inner_stats + let input_schema = self.input.schema(); + let input_stats_by_name = inner_stats .column_statistics .iter() - .zip(schema.fields()) - .find(|(_, field)| field.name() == &self.column) - .map(|(stats, _)| ColumnStatistics { + .zip(input_schema.fields()) + .map(|(stats, field)| (field.name().as_str(), stats.clone())) + .collect::>(); + let vector_root = lance_core::datatypes::parse_field_path(&self.column) + .ok() + .and_then(|parts| parts.first().cloned()) + .unwrap_or_else(|| self.column.clone()); + let dist_stats = input_stats_by_name + .get(vector_root.as_str()) + .map(|stats| ColumnStatistics { null_count: stats.null_count, ..Default::default() }) .unwrap_or_default(); - let column_statistics = inner_stats - .column_statistics - .into_iter() - .zip(schema.fields()) - .filter(|(_, field)| field.name() != DIST_COL) - .map(|(stats, _)| stats) + let column_statistics = self + .output_schema + .fields() + .iter() + .map(|field| { + if field.name() == QUERY_INDEX_COL { + ColumnStatistics::default() + } else if field.name() == DIST_COL { + dist_stats.clone() + } else { + input_stats_by_name + .get(field.name().as_str()) + .cloned() + .unwrap_or_default() + } + }) .collect::>(); - let column_statistics = if self.is_batch { - std::iter::once(ColumnStatistics::default()) - .chain(column_statistics) - .chain(std::iter::once(dist_stats)) - .collect::>() - } else { - column_statistics - .into_iter() - .chain(std::iter::once(dist_stats)) - .collect::>() - }; Ok(Statistics { num_rows: 
inner_stats.num_rows, column_statistics, @@ -646,8 +1014,37 @@ struct BatchKnnCandidate { query_index: i32, distance: f32, row_id: u64, - batch: RecordBatch, - row_index: u32, + extra: BatchKnnExtra, +} + +#[derive(Clone)] +enum BatchKnnExtra { + RowIdOnly, + WithSlimBatch { + slim_batch: Arc, + row_index: u32, + vector_row: Option, + }, +} + +fn would_enter_heap( + heap: &BinaryHeap, + k: usize, + distance: f32, + row_id: u64, + query_index: i32, +) -> bool { + if heap.len() < k { + return true; + } + let worst = heap.peek().expect("heap non-empty when len >= k"); + let probe = BatchKnnCandidate { + query_index, + distance, + row_id, + extra: BatchKnnExtra::RowIdOnly, + }; + probe.cmp(worst).is_lt() } impl PartialEq for BatchKnnCandidate { @@ -655,7 +1052,6 @@ impl PartialEq for BatchKnnCandidate { self.query_index == other.query_index && self.distance == other.distance && self.row_id == other.row_id - && self.row_index == other.row_index } } @@ -673,7 +1069,6 @@ impl Ord for BatchKnnCandidate { .total_cmp(&other.distance) .then_with(|| self.row_id.cmp(&other.row_id)) .then_with(|| self.query_index.cmp(&other.query_index)) - .then_with(|| self.row_index.cmp(&other.row_index)) } } @@ -1898,6 +2293,7 @@ mod tests { use arrow::datatypes::Float32Type; use arrow_array::{ ArrayRef, FixedSizeListArray, Float32Array, Int32Array, RecordBatchIterator, StringArray, + StructArray, }; use arrow_schema::{Field as ArrowField, Schema as ArrowSchema}; use async_trait::async_trait; @@ -2684,6 +3080,254 @@ mod tests { ); } + #[test] + fn test_batch_partition_statistics_aligns_with_output_schema() { + let schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("i", DataType::Int32, true), + ArrowField::new( + "vec", + DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float32, true)), + 4, + ), + true, + ), + ROW_ID_FIELD.clone(), + ])); + let batch = RecordBatch::new_empty(schema); + let input: Arc = Arc::new(TestingExec::new(vec![batch])); + let query = 
Arc::new(Float32Array::from(vec![0.0, 1.0, 2.0, 3.0])) as ArrayRef; + let plan = KNNVectorDistanceExec::try_new_batch( + input, + "vec", + query, + KnnBatchParams { + is_batch: true, + query_count: 1, + k: 2, + lower_bound: None, + upper_bound: None, + distance_type: DistanceType::L2, + retain_vector: false, + }, + ) + .unwrap(); + let stats = plan.partition_statistics(None).unwrap(); + assert_eq!( + stats.column_statistics.len(), + plan.schema().fields().len(), + "partition stats must align with output schema" + ); + let schema = plan.schema(); + let query_index_pos = schema + .column_with_name(QUERY_INDEX_COL) + .expect("query_index must exist") + .0; + let dist_pos = schema + .column_with_name(DIST_COL) + .expect("distance must exist") + .0; + assert_eq!( + stats.column_statistics[query_index_pos], + ColumnStatistics::default(), + ); + assert_eq!( + stats.column_statistics[dist_pos].null_count, + stats.column_statistics[schema.column_with_name("i").unwrap().0].null_count, + "distance null-count should be derived from vector/input nullability and remain aligned" + ); + } + + #[test] + fn test_remove_vector_from_schema_nested_path() { + let payload_field = ArrowField::new( + "payload", + DataType::Struct( + vec![ + ArrowField::new( + "vec", + DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float32, true)), + 4, + ), + true, + ), + ArrowField::new("tag", DataType::Utf8, true), + ] + .into(), + ), + true, + ); + let schema = ArrowSchema::new(vec![ + ArrowField::new("i", DataType::Int32, true), + payload_field, + ROW_ID_FIELD.clone(), + ]); + let without_vec = + KNNVectorDistanceExec::remove_vector_from_schema(&schema, "payload.vec").unwrap(); + let payload = without_vec.field_with_name("payload").unwrap(); + let DataType::Struct(children) = payload.data_type() else { + panic!("payload should remain struct"); + }; + assert!(children.iter().all(|f| f.name() != "vec")); + assert!(children.iter().any(|f| f.name() == "tag")); + } + + #[test] + fn 
test_take_vector_row_copies_single_row() { + let vectors = FixedSizeListArray::try_new_from_values( + Float32Array::from((0..12).map(|v| v as f32).collect::>()), + 4, + ) + .unwrap(); + let row = KNNVectorDistanceExec::take_vector_row(&vectors, 2).unwrap(); + assert_eq!(row.len(), 1); + assert_eq!( + row.to_data().offset(), + 0, + "take/copy should not retain row offset into the full input buffer" + ); + } + + #[test] + fn test_resolve_vector_column_supports_escaped_nested_path() { + let vec_field = ArrowField::new( + "vec.with.dot", + DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float32, true)), + 4, + ), + true, + ); + let payload_field = ArrowField::new( + "payload", + DataType::Struct(vec![vec_field.clone()].into()), + true, + ); + let schema = Arc::new(ArrowSchema::new(vec![payload_field])); + let vectors = FixedSizeListArray::try_new_from_values( + Float32Array::from((0..8).map(|v| v as f32).collect::>()), + 4, + ) + .unwrap(); + let payload = StructArray::from(vec![(Arc::new(vec_field), Arc::new(vectors) as ArrayRef)]); + let batch = RecordBatch::try_new(schema, vec![Arc::new(payload)]).unwrap(); + let vector = + KNNVectorDistanceExec::resolve_vector_column(&batch, "payload.`vec.with.dot`").unwrap(); + assert_eq!(vector.len(), 2); + } + + #[test] + fn test_remove_vector_from_batch_nested_keeps_siblings() { + let vec_field = ArrowField::new( + "vec.with.dot", + DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float32, true)), + 4, + ), + true, + ); + let tag_field = ArrowField::new("tag", DataType::Utf8, true); + let payload_field = ArrowField::new( + "payload", + DataType::Struct(vec![vec_field.clone(), tag_field.clone()].into()), + true, + ); + let schema = Arc::new(ArrowSchema::new(vec![payload_field])); + let vectors = FixedSizeListArray::try_new_from_values( + Float32Array::from((0..8).map(|v| v as f32).collect::>()), + 4, + ) + .unwrap(); + let tags = StringArray::from(vec!["a", "b"]); + let payload = 
StructArray::from(vec![ + (Arc::new(vec_field), Arc::new(vectors) as ArrayRef), + (Arc::new(tag_field), Arc::new(tags) as ArrayRef), + ]); + let batch = RecordBatch::try_new(schema, vec![Arc::new(payload)]).unwrap(); + + let slim = + KNNVectorDistanceExec::remove_vector_from_batch(&batch, "payload.`vec.with.dot`") + .unwrap(); + let payload = slim.column_by_name("payload").unwrap().as_struct(); + assert!(payload.column_by_name("vec.with.dot").is_none()); + assert!(payload.column_by_name("tag").is_some()); + } + + #[test] + fn test_assemble_batch_output_retained_nested_vector_keeps_sibling_values() { + let vec_field = ArrowField::new( + "vec", + DataType::FixedSizeList( + Arc::new(ArrowField::new("item", DataType::Float32, true)), + 4, + ), + true, + ); + let tag_field = ArrowField::new("tag", DataType::Utf8, true); + let payload_field = ArrowField::new( + "payload", + DataType::Struct(vec![vec_field.clone(), tag_field.clone()].into()), + true, + ); + let schema = Arc::new(ArrowSchema::new(vec![payload_field, ROW_ID_FIELD.clone()])); + let vectors = FixedSizeListArray::try_new_from_values( + Float32Array::from((0..12).map(|v| v as f32).collect::>()), + 4, + ) + .unwrap(); + let tags = StringArray::from(vec!["a", "b", "c"]); + let payload = StructArray::from(vec![ + (Arc::new(vec_field), Arc::new(vectors) as ArrayRef), + (Arc::new(tag_field), Arc::new(tags) as ArrayRef), + ]); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(payload) as ArrayRef, + Arc::new(UInt64Array::from(vec![10, 11, 12])) as ArrayRef, + ], + ) + .unwrap(); + let slim_batch = Arc::new( + KNNVectorDistanceExec::remove_vector_from_batch(&batch, "payload.vec").unwrap(), + ); + let vectors = KNNVectorDistanceExec::resolve_vector_column(&batch, "payload.vec").unwrap(); + let results = [2, 0] + .into_iter() + .map(|row_index| BatchKnnCandidate { + query_index: 0, + distance: row_index as f32, + row_id: 10 + row_index as u64, + extra: BatchKnnExtra::WithSlimBatch { + slim_batch: 
Arc::clone(&slim_batch), + row_index, + vector_row: Some( + KNNVectorDistanceExec::take_vector_row(vectors.as_ref(), row_index) + .unwrap(), + ), + }, + }) + .collect::>(); + + let output = KNNVectorDistanceExec::assemble_batch_output( + &results, + schema.as_ref(), + "payload.vec", + true, + ) + .unwrap(); + + let payload = output.column_by_name("payload").unwrap().as_struct(); + let tags = payload.column_by_name("tag").unwrap().as_string::(); + assert!(tags.is_valid(0)); + assert!(tags.is_valid(1)); + assert_eq!(tags.value(0), "c"); + assert_eq!(tags.value(1), "a"); + let vectors = payload.column_by_name("vec").unwrap(); + assert_eq!(vectors.len(), 2); + } + #[tokio::test] async fn test_multivector_score() { let query = Query { From f9690bac32228c5437a4012059810926f75b2eb2 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 22 Jun 2026 08:14:07 -0700 Subject: [PATCH 158/177] refactor: remove as_vector_index from the Index trait (#7392) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The method was never called — only defined in the trait and implemented by all 20+ implementors as boilerplate. Callers that need a VectorIndex can downcast via as_any() instead. 
--------- Co-authored-by: Claude Sonnet 4.6 --- rust/lance-index/src/frag_reuse.rs | 6 ------ rust/lance-index/src/lib.rs | 3 --- rust/lance-index/src/mem_wal.rs | 6 ------ rust/lance-index/src/scalar/bitmap.rs | 6 ------ rust/lance-index/src/scalar/bloomfilter.rs | 7 ------- rust/lance-index/src/scalar/btree.rs | 6 ------ rust/lance-index/src/scalar/fmindex.rs | 6 ------ rust/lance-index/src/scalar/inverted/index.rs | 6 ------ rust/lance-index/src/scalar/json.rs | 4 ---- rust/lance-index/src/scalar/label_list.rs | 6 ------ rust/lance-index/src/scalar/ngram.rs | 7 ------- rust/lance-index/src/scalar/rtree.rs | 7 ------- rust/lance-index/src/scalar/zonemap.rs | 7 ------- rust/lance-index/src/vector/hnsw/index.rs | 5 ----- rust/lance/src/index/scalar_logical.rs | 7 ------- rust/lance/src/index/vector/fixture_test.rs | 4 ---- rust/lance/src/index/vector/ivf.rs | 4 ---- rust/lance/src/index/vector/ivf/v2.rs | 4 ---- rust/lance/src/index/vector/pq.rs | 4 ---- rust/lance/src/io/exec/knn.rs | 8 -------- rust/lance/src/session/index_extension.rs | 4 ---- 21 files changed, 117 deletions(-) diff --git a/rust/lance-index/src/frag_reuse.rs b/rust/lance-index/src/frag_reuse.rs index d09d8dc0684..d42b41ca9f0 100644 --- a/rust/lance-index/src/frag_reuse.rs +++ b/rust/lance-index/src/frag_reuse.rs @@ -34,12 +34,6 @@ impl Index for FragReuseIndex { self } - fn as_vector_index(self: Arc) -> Result> { - Err(Error::not_supported_source( - "FragReuseIndex is not a vector index".into(), - )) - } - fn statistics(&self) -> Result { let stats = FragReuseStatistics { num_versions: self.details.versions.len(), diff --git a/rust/lance-index/src/lib.rs b/rust/lance-index/src/lib.rs index 20e1c2692d9..61b45550367 100644 --- a/rust/lance-index/src/lib.rs +++ b/rust/lance-index/src/lib.rs @@ -85,9 +85,6 @@ pub trait Index: Send + Sync + DeepSizeOf { /// Cast to [Index] fn as_index(self: Arc) -> Arc; - /// Cast to [vector::VectorIndex] - fn as_vector_index(self: Arc) -> Result>; - /// Retrieve 
index statistics as a JSON Value fn statistics(&self) -> Result; diff --git a/rust/lance-index/src/mem_wal.rs b/rust/lance-index/src/mem_wal.rs index f8f42093894..9bd72ff7866 100644 --- a/rust/lance-index/src/mem_wal.rs +++ b/rust/lance-index/src/mem_wal.rs @@ -38,12 +38,6 @@ impl Index for MemWalIndex { self } - fn as_vector_index(self: Arc) -> lance_core::Result> { - Err(Error::not_supported_source( - "MemWalIndex is not a vector index".into(), - )) - } - fn statistics(&self) -> lance_core::Result { let stats = MemWalStatistics { num_shards: self.details.num_shards, diff --git a/rust/lance-index/src/scalar/bitmap.rs b/rust/lance-index/src/scalar/bitmap.rs index c2a6e80e82b..8729aadbca2 100644 --- a/rust/lance-index/src/scalar/bitmap.rs +++ b/rust/lance-index/src/scalar/bitmap.rs @@ -551,12 +551,6 @@ impl Index for BitmapIndex { self } - fn as_vector_index(self: Arc) -> Result> { - Err(Error::not_supported_source( - "BitmapIndex is not a vector index".into(), - )) - } - async fn prewarm(&self) -> Result<()> { let page_lookup_file = self.lazy_reader.get().await?; let total_rows = page_lookup_file.num_rows(); diff --git a/rust/lance-index/src/scalar/bloomfilter.rs b/rust/lance-index/src/scalar/bloomfilter.rs index 856f08af772..596ea4cc989 100644 --- a/rust/lance-index/src/scalar/bloomfilter.rs +++ b/rust/lance-index/src/scalar/bloomfilter.rs @@ -29,7 +29,6 @@ use std::{collections::HashMap, sync::Arc}; use crate::scalar::FragReuseIndex; use crate::scalar::{AnyQuery, IndexStore, MetricsCollector, ScalarIndex, SearchResult}; -use crate::vector::VectorIndex; use crate::{Index, IndexType}; use arrow_array::{ArrayRef, RecordBatch}; use async_trait::async_trait; @@ -377,12 +376,6 @@ impl Index for BloomFilterIndex { self } - fn as_vector_index(self: Arc) -> Result> { - Err(Error::invalid_input_source( - "BloomFilter is not a vector index".into(), - )) - } - async fn prewarm(&self) -> Result<()> { Ok(()) } diff --git a/rust/lance-index/src/scalar/btree.rs 
b/rust/lance-index/src/scalar/btree.rs index 85c42e9b048..7668973b44e 100644 --- a/rust/lance-index/src/scalar/btree.rs +++ b/rust/lance-index/src/scalar/btree.rs @@ -1998,12 +1998,6 @@ impl Index for BTreeIndex { self } - fn as_vector_index(self: Arc) -> Result> { - Err(Error::not_supported_source( - "BTreeIndex is not vector index".into(), - )) - } - async fn prewarm(&self) -> Result<()> { let index_reader = LazyIndexReader::new(self.store.clone(), self.ranges_to_files.clone()); let reader = index_reader.get().await?; diff --git a/rust/lance-index/src/scalar/fmindex.rs b/rust/lance-index/src/scalar/fmindex.rs index cdf19f0304c..79c949a5426 100644 --- a/rust/lance-index/src/scalar/fmindex.rs +++ b/rust/lance-index/src/scalar/fmindex.rs @@ -46,7 +46,6 @@ use crate::scalar::{ AnyQuery, BuiltinIndexType, CreatedIndex, IndexFile, IndexStore, OldIndexDataFilter, ScalarIndex, ScalarIndexParams, SearchResult, TextQuery, UpdateCriteria, }; -use crate::vector::VectorIndex; use crate::{Index, IndexType}; const FMINDEX_INDEX_VERSION: u32 = 10; @@ -1295,11 +1294,6 @@ impl Index for FMIndexScalarIndex { fn as_index(self: Arc) -> Arc { self } - fn as_vector_index(self: Arc) -> Result> { - Err(Error::invalid_input_source( - "Fm is not a vector index".into(), - )) - } async fn prewarm(&self) -> Result<()> { Ok(()) } diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index 41a18c3bd68..8185872caf9 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -986,12 +986,6 @@ impl Index for InvertedIndex { self } - fn as_vector_index(self: Arc) -> Result> { - Err(Error::invalid_input( - "inverted index cannot be cast to vector index", - )) - } - fn statistics(&self) -> Result { let num_tokens = self .partitions diff --git a/rust/lance-index/src/scalar/json.rs b/rust/lance-index/src/scalar/json.rs index 7adf055db61..b20eb10ff20 100644 --- a/rust/lance-index/src/scalar/json.rs 
+++ b/rust/lance-index/src/scalar/json.rs @@ -76,10 +76,6 @@ impl Index for JsonIndex { self } - fn as_vector_index(self: Arc) -> Result> { - unimplemented!() - } - fn index_type(&self) -> IndexType { // TODO: This causes the index to appear as btree in list_indices call. Need better logic // in list_indices to use details instead of index_type. diff --git a/rust/lance-index/src/scalar/label_list.rs b/rust/lance-index/src/scalar/label_list.rs index 8e07a607bff..77232952419 100644 --- a/rust/lance-index/src/scalar/label_list.rs +++ b/rust/lance-index/src/scalar/label_list.rs @@ -110,12 +110,6 @@ impl Index for LabelListIndex { self } - fn as_vector_index(self: Arc) -> Result> { - Err(Error::not_supported_source( - "LabeListIndex is not a vector index".into(), - )) - } - async fn prewarm(&self) -> Result<()> { self.values_index.prewarm().await } diff --git a/rust/lance-index/src/scalar/ngram.rs b/rust/lance-index/src/scalar/ngram.rs index b452ef78c85..582c7aab157 100644 --- a/rust/lance-index/src/scalar/ngram.rs +++ b/rust/lance-index/src/scalar/ngram.rs @@ -24,7 +24,6 @@ use crate::scalar::registry::{ VALUE_COLUMN_NAME, }; use crate::scalar::{CreatedIndex, UpdateCriteria}; -use crate::vector::VectorIndex; use crate::{Index, IndexType}; use arrow::array::{AsArray, UInt32Builder}; use arrow::datatypes::{UInt32Type, UInt64Type}; @@ -397,12 +396,6 @@ impl Index for NGramIndex { self } - fn as_vector_index(self: Arc) -> Result> { - Err(Error::invalid_input_source( - "NGramIndex is not a vector index".into(), - )) - } - fn statistics(&self) -> Result { let ngram_stats = NGramStatistics { num_ngrams: self.tokens.len(), diff --git a/rust/lance-index/src/scalar/rtree.rs b/rust/lance-index/src/scalar/rtree.rs index 5d5ac2a3a92..e26177b4b4a 100644 --- a/rust/lance-index/src/scalar/rtree.rs +++ b/rust/lance-index/src/scalar/rtree.rs @@ -13,7 +13,6 @@ use crate::scalar::{ AnyQuery, BuiltinIndexType, CreatedIndex, GeoQuery, IndexFile, IndexReader, IndexStore, IndexWriter, 
ScalarIndex, ScalarIndexParams, SearchResult, UpdateCriteria, }; -use crate::vector::VectorIndex; use crate::{Index, IndexType, pb}; use arrow_array::UInt32Array; use arrow_array::cast::AsArray; @@ -449,12 +448,6 @@ impl Index for RTreeIndex { self } - fn as_vector_index(self: Arc) -> Result> { - Err(Error::not_supported_source( - "RTreeIndex is not vector index".into(), - )) - } - fn statistics(&self) -> Result { serde_json::to_value(self.metadata.clone()) .map_err(|e| Error::internal(format!("Error serializing statistics: {}", e))) diff --git a/rust/lance-index/src/scalar/zonemap.rs b/rust/lance-index/src/scalar/zonemap.rs index 8e7e20c211a..af5380cce30 100644 --- a/rust/lance-index/src/scalar/zonemap.rs +++ b/rust/lance-index/src/scalar/zonemap.rs @@ -37,7 +37,6 @@ use std::{collections::HashMap, sync::Arc}; use super::{AnyQuery, IndexStore, MetricsCollector, ScalarIndex, SearchResult}; use crate::scalar::FragReuseIndex; -use crate::vector::VectorIndex; use crate::{Index, IndexType}; use async_trait::async_trait; use lance_core::Error; @@ -548,12 +547,6 @@ impl Index for ZoneMapIndex { self } - fn as_vector_index(self: Arc) -> Result> { - Err(Error::invalid_input_source( - "ZoneMapIndex is not a vector index".into(), - )) - } - async fn prewarm(&self) -> Result<()> { // Not much to prewarm Ok(()) diff --git a/rust/lance-index/src/vector/hnsw/index.rs b/rust/lance-index/src/vector/hnsw/index.rs index 0ae42f59414..c8c9e5164fe 100644 --- a/rust/lance-index/src/vector/hnsw/index.rs +++ b/rust/lance-index/src/vector/hnsw/index.rs @@ -119,11 +119,6 @@ impl Index for HNSWIndex { self } - /// Cast to [VectorIndex] - fn as_vector_index(self: Arc) -> Result> { - Ok(self) - } - /// Retrieve index statistics as a JSON Value fn statistics(&self) -> Result { Ok(json!({ diff --git a/rust/lance/src/index/scalar_logical.rs b/rust/lance/src/index/scalar_logical.rs index f3a7b637202..8ef86a6cb5f 100644 --- a/rust/lance/src/index/scalar_logical.rs +++ 
b/rust/lance/src/index/scalar_logical.rs @@ -86,13 +86,6 @@ impl Index for LogicalScalarIndex { self } - fn as_vector_index(self: Arc) -> Result> { - Err(Error::invalid_input(format!( - "LogicalScalarIndex '{}' is not a vector index", - self.name - ))) - } - fn statistics(&self) -> Result { Ok(json!({ "index_name": self.name, diff --git a/rust/lance/src/index/vector/fixture_test.rs b/rust/lance/src/index/vector/fixture_test.rs index 91d5c434dd1..1b82a7f6941 100644 --- a/rust/lance/src/index/vector/fixture_test.rs +++ b/rust/lance/src/index/vector/fixture_test.rs @@ -71,10 +71,6 @@ mod test { self } - fn as_vector_index(self: Arc) -> Result> { - Ok(self) - } - async fn prewarm(&self) -> Result<()> { Ok(()) } diff --git a/rust/lance/src/index/vector/ivf.rs b/rust/lance/src/index/vector/ivf.rs index fb01339ead9..5d477a2e8dd 100644 --- a/rust/lance/src/index/vector/ivf.rs +++ b/rust/lance/src/index/vector/ivf.rs @@ -1088,10 +1088,6 @@ impl Index for IVFIndex { self } - fn as_vector_index(self: Arc) -> Result> { - Ok(self) - } - fn index_type(&self) -> IndexType { if self.sub_index.as_any().downcast_ref::().is_some() { IndexType::IvfPq diff --git a/rust/lance/src/index/vector/ivf/v2.rs b/rust/lance/src/index/vector/ivf/v2.rs index 5b29752f7c1..40227d2d020 100644 --- a/rust/lance/src/index/vector/ivf/v2.rs +++ b/rust/lance/src/index/vector/ivf/v2.rs @@ -1288,10 +1288,6 @@ impl Index for IVFIndex) -> Result> { - Ok(self) - } - async fn prewarm(&self) -> Result<()> { futures::stream::iter(0..self.ivf.num_partitions()) .map(Ok) diff --git a/rust/lance/src/index/vector/pq.rs b/rust/lance/src/index/vector/pq.rs index a661a314b4d..6e335cddc80 100644 --- a/rust/lance/src/index/vector/pq.rs +++ b/rust/lance/src/index/vector/pq.rs @@ -180,10 +180,6 @@ impl Index for PQIndex { self } - fn as_vector_index(self: Arc) -> Result> { - Ok(self) - } - fn index_type(&self) -> IndexType { IndexType::Vector } diff --git a/rust/lance/src/io/exec/knn.rs b/rust/lance/src/io/exec/knn.rs index 
73e901aee04..01125ac1617 100644 --- a/rust/lance/src/io/exec/knn.rs +++ b/rust/lance/src/io/exec/knn.rs @@ -2416,10 +2416,6 @@ mod tests { self } - fn as_vector_index(self: Arc) -> Result> { - Ok(self) - } - fn statistics(&self) -> Result { Ok(serde_json::json!({})) } @@ -2540,10 +2536,6 @@ mod tests { self } - fn as_vector_index(self: Arc) -> Result> { - Ok(self) - } - fn statistics(&self) -> Result { Ok(serde_json::json!({})) } diff --git a/rust/lance/src/session/index_extension.rs b/rust/lance/src/session/index_extension.rs index de9e61b5f8f..301213c6f06 100644 --- a/rust/lance/src/session/index_extension.rs +++ b/rust/lance/src/session/index_extension.rs @@ -111,10 +111,6 @@ mod test { self } - fn as_vector_index(self: Arc) -> Result> { - Ok(self) - } - async fn prewarm(&self) -> Result<()> { Ok(()) } From 2b1b10016bf6e703e70d93d96f797202e48704c9 Mon Sep 17 00:00:00 2001 From: Yang Cen Date: Mon, 22 Jun 2026 23:58:01 +0800 Subject: [PATCH 159/177] fix(fts): enforce required terms for and queries (#7385) ## Bug Fix ### What is the bug? FTS `AND` queries could return matches from a partition that only contained a subset of the required query terms. For fuzzy `AND`, expansions were also flattened without preserving the original query-position grouping, so missing required positions and same-position expansion scoring could produce incorrect results. ### What issues or incorrect behavior does the bug cause? A query such as `alpha AND beta` could return rows from a partition that only had `alpha` because the missing term was skipped before WAND saw the query. Fuzzy `AND` could also treat expansions from one original position as separate required terms, or score grouped expansions using the wrong token IDF, which could affect top-k ordering. ### How does this PR fix the problem? This PR makes partition posting-list loading aware of the query operator. 
For `AND`, a partition now returns empty results when any required original position has no exact term or fuzzy expansion. For fuzzy `AND`, expansions are grouped by original query position, same-position expansions are unioned for candidate selection, and final scoring uses the actual matched expansion token frequencies. ## Validation - `cargo fmt --all --check` - `git diff --check` - `CARGO_TARGET_DIR=... cargo test -p lance-index test_fuzzy_and_scores_grouped_expansions_by_matched_token -- --nocapture` - `CARGO_TARGET_DIR=... cargo test -p lance-index test_and_query -- --nocapture` - `CARGO_TARGET_DIR=... cargo test -p lance-index test_fuzzy_and_groups_expansions_by_original_position -- --nocapture` - `CARGO_TARGET_DIR=... cargo test -p lance-index bm25_search -- --nocapture` - `CARGO_TARGET_DIR=... cargo test -p lance-index scalar::inverted::wand::tests -- --nocapture` --- rust/lance-index/src/scalar/inverted/index.rs | 838 +++++++++++++++++- rust/lance-index/src/scalar/inverted/wand.rs | 58 +- 2 files changed, 840 insertions(+), 56 deletions(-) diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index 8185872caf9..ac13fc0c585 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -9,7 +9,7 @@ use std::{ collections::BinaryHeap, }; use std::{ - collections::{HashMap, HashSet}, + collections::{BTreeMap, HashMap, HashSet}, ops::Range, time::Instant, }; @@ -207,6 +207,7 @@ impl FromStr for InvertedListFormatVersion { #[derive(Debug)] struct PartitionCandidates { tokens_by_position: Vec, + grouped_expansions: Vec, candidates: Vec, } @@ -214,11 +215,74 @@ impl PartitionCandidates { fn empty() -> Self { Self { tokens_by_position: Vec::new(), + grouped_expansions: Vec::new(), candidates: Vec::new(), } } } +#[derive(Debug)] +struct LoadedPostings { + postings: Vec, + grouped_expansions: Vec, +} + +impl LoadedPostings { + fn empty() -> Self { + Self { + 
postings: Vec::new(), + grouped_expansions: Vec::new(), + } + } +} + +#[derive(Debug)] +struct GroupedExpansionTerms { + position: u32, + terms: Vec, +} + +fn grouped_rescore_wand_limit( + limit: Option, + grouped_expansions: &[GroupedExpansionTerms], +) -> Option { + let limit = limit?; + // Grouped fuzzy AND rescoring needs a small candidate cushion because WAND + // ranks by the unioned group posting first and the exact expansion IDF later. + let expansion_terms = grouped_expansions + .iter() + .map(|group| group.terms.len()) + .sum::() + .max(1); + Some(limit.saturating_mul(expansion_terms)) +} + +#[derive(Debug)] +struct ExpansionTermFreqs { + token: String, + freqs_by_posting_doc_id: Vec<(u64, u32)>, +} + +impl ExpansionTermFreqs { + fn new(token: String, posting: &PostingList) -> Self { + let freqs_by_posting_doc_id = posting + .iter() + .map(|(posting_doc_id, freq, _)| (posting_doc_id, freq)) + .collect(); + Self { + token, + freqs_by_posting_doc_id, + } + } + + fn frequency(&self, posting_doc_id: u64) -> Option { + self.freqs_by_posting_doc_id + .binary_search_by_key(&posting_doc_id, |(doc_id, _)| *doc_id) + .ok() + .map(|idx| self.freqs_by_posting_doc_id[idx].1) + } +} + #[derive(Copy, Clone, Debug, Eq, PartialEq, Hash, Default)] pub enum TokenSetFormat { Arrow, @@ -653,9 +717,10 @@ impl InvertedIndex { let expanded = partition.expand_fuzzy(tokens, params)?; for idx in 0..expanded.len() { let token = expanded.get_token(idx); - if seen.insert(token.to_string()) { + let position = expanded.position(idx); + if seen.insert((token.to_string(), position)) { expanded_tokens.push(token.to_string()); - expanded_positions.push(expanded.position(idx)); + expanded_positions.push(position); } } } @@ -717,9 +782,18 @@ impl InvertedIndex { let metrics = metrics.clone(); let shared_threshold = shared_threshold.clone(); async move { - let postings = part - .load_posting_lists(tokens.as_ref(), params.as_ref(), metrics.as_ref()) + let loaded_postings = part + 
.load_posting_lists( + tokens.as_ref(), + params.as_ref(), + operator, + metrics.as_ref(), + ) .await?; + let LoadedPostings { + postings, + grouped_expansions, + } = loaded_postings; if postings.is_empty() { // No hits in this partition; its DocSet stays // unloaded, so we never pay the per-doc @@ -741,22 +815,38 @@ impl InvertedIndex { let mask = mask.clone(); let metrics = metrics.clone(); let part_for_wand = part.clone(); - let mut partition_result = spawn_cpu(move || { + let has_grouped_expansions = !grouped_expansions.is_empty(); + let wand_params = if has_grouped_expansions { + let mut rescoring_params = params.as_ref().clone(); + rescoring_params.limit = + grouped_rescore_wand_limit(params.limit, &grouped_expansions); + Arc::new(rescoring_params) + } else { + params.clone() + }; + let partition_threshold = if has_grouped_expansions { + Arc::new(AtomicU32::new(f32::NEG_INFINITY.to_bits())) + } else { + shared_threshold + }; + let candidates = spawn_cpu(move || { let candidates = part_for_wand.bm25_search( docs_for_wand.as_ref(), - params.as_ref(), + wand_params.as_ref(), operator, mask, postings, metrics.as_ref(), - shared_threshold, + partition_threshold, )?; - std::result::Result::<_, Error>::Ok(PartitionCandidates { - tokens_by_position, - candidates, - }) + std::result::Result::<_, Error>::Ok(candidates) }) .await?; + let mut partition_result = PartitionCandidates { + tokens_by_position, + grouped_expansions, + candidates, + }; resolve_deferred_candidates(&part.docs, &mut partition_result.candidates) .await?; Result::Ok(partition_result) @@ -769,8 +859,17 @@ impl InvertedIndex { if res.candidates.is_empty() { continue; } - let mut idf_by_position = Vec::with_capacity(res.tokens_by_position.len()); - for token in &res.tokens_by_position { + let PartitionCandidates { + tokens_by_position, + grouped_expansions, + candidates: part_candidates, + } = res; + let grouped_positions = grouped_expansions + .iter() + .map(|group| group.position) + .collect::>(); + 
let mut idf_by_position = Vec::with_capacity(tokens_by_position.len()); + for token in &tokens_by_position { let idf_weight = match idf_cache.get(token) { Some(weight) => *weight, None => { @@ -783,9 +882,10 @@ impl InvertedIndex { } for DocCandidate { addr, + posting_doc_id, freqs, doc_length, - } in res.candidates + } in part_candidates { // resolve_deferred_candidates ran upstream, so every // candidate carries a real row_id at this point. @@ -799,10 +899,29 @@ impl InvertedIndex { }; let mut score = 0.0; for (term_index, freq) in freqs.into_iter() { + if grouped_positions.contains(&term_index) { + continue; + } debug_assert!((term_index as usize) < idf_by_position.len()); score += idf_by_position[term_index as usize] * scorer.doc_weight(freq, doc_length); } + for group in &grouped_expansions { + for term in &group.terms { + let Some(freq) = term.frequency(posting_doc_id) else { + continue; + }; + let idf_weight = match idf_cache.get(&term.token) { + Some(weight) => *weight, + None => { + let weight = scorer.query_weight(&term.token); + idf_cache.insert(term.token.clone(), weight); + weight + } + }; + score += idf_weight * scorer.doc_weight(freq, doc_length); + } + } if candidates.len() < limit { candidates.push(Reverse(ScoredDoc::new(row_id, score))); } else if candidates.peek().unwrap().0.score.0 < score { @@ -1291,7 +1410,14 @@ impl InvertedPartition { pub fn expand_fuzzy(&self, tokens: &Tokens, params: &FtsSearchParams) -> Result { let mut new_tokens = Vec::with_capacity(min(tokens.len(), params.max_expansions)); - for token in tokens { + let mut new_positions = Vec::with_capacity(new_tokens.capacity()); + let mut seen = HashSet::new(); + for token_idx in 0..tokens.len() { + if new_tokens.len() >= params.max_expansions { + break; + } + let token = tokens.get_token(token_idx); + let position = tokens.position(token_idx); let fuzziness = match params.fuzziness { Some(fuzziness) => fuzziness, None => MatchQuery::auto_fuzziness(token), @@ -1301,39 +1427,150 @@ 
impl InvertedPartition { let base_len = tokens.token_type().prefix_len(token) as u32; if let TokenMap::Fst(ref map) = self.tokens.tokens { + let mut expanded = Vec::new(); + let remaining = params.max_expansions - new_tokens.len(); match base_len + params.prefix_length { - 0 => take_fst_keys(map.search(lev), &mut new_tokens, params.max_expansions), + 0 => take_fst_keys(map.search(lev), &mut expanded, remaining), prefix_length => { let prefix = &token[..min(prefix_length as usize, token.len())]; let prefix = fst::automaton::Str::new(prefix).starts_with(); take_fst_keys( map.search(lev.intersection(prefix)), - &mut new_tokens, - params.max_expansions, + &mut expanded, + remaining, ) } } + for token in expanded { + if seen.insert((token.clone(), position)) { + new_tokens.push(token); + new_positions.push(position); + if new_tokens.len() >= params.max_expansions { + break; + } + } + } } else { return Err(Error::index( "tokens is not fst, which is not expected".to_owned(), )); } } - Ok(Tokens::new(new_tokens, tokens.token_type().clone())) + Ok(Tokens::with_positions( + new_tokens, + new_positions, + tokens.token_type().clone(), + )) + } + + fn union_plain_posting_lists(postings: Vec) -> Result { + let mut freqs_by_row_id = BTreeMap::new(); + for posting in postings { + for (row_id, freq, _) in posting.iter() { + let entry = freqs_by_row_id.entry(row_id).or_insert(0u32); + *entry = entry.checked_add(freq).ok_or_else(|| { + Error::index(format!("posting frequency overflow for row id {}", row_id)) + })?; + } + } + let mut row_ids = Vec::with_capacity(freqs_by_row_id.len()); + let mut frequencies = Vec::with_capacity(freqs_by_row_id.len()); + for (row_id, freq) in freqs_by_row_id { + row_ids.push(row_id); + frequencies.push(freq as f32); + } + Ok(PostingList::Plain(PlainPostingList::new( + ScalarBuffer::from(row_ids), + ScalarBuffer::from(frequencies), + None, + None, + ))) + } + + fn union_compressed_posting_lists( + postings: Vec, + docs: &DocSet, + ) -> Result { + let 
mut freqs_by_doc_id = BTreeMap::new(); + for posting in postings { + for (doc_id, freq, _) in posting.iter() { + let doc_id = u32::try_from(doc_id).map_err(|_| { + Error::index(format!( + "compressed posting doc id {} exceeds u32::MAX", + doc_id + )) + })?; + let entry = freqs_by_doc_id.entry(doc_id).or_insert(0u32); + *entry = entry.checked_add(freq).ok_or_else(|| { + Error::index(format!("posting frequency overflow for doc id {}", doc_id)) + })?; + } + } + if freqs_by_doc_id.is_empty() { + return Ok(PostingList::Plain(PlainPostingList::new( + ScalarBuffer::from(Vec::::new()), + ScalarBuffer::from(Vec::::new()), + None, + None, + ))); + } + + let mut builder = PostingListBuilder::new(false); + let mut doc_ids = Vec::with_capacity(freqs_by_doc_id.len()); + let mut frequencies = Vec::with_capacity(freqs_by_doc_id.len()); + for (doc_id, freq) in freqs_by_doc_id { + builder.add(doc_id, PositionRecorder::Count(freq)); + doc_ids.push(doc_id); + frequencies.push(freq); + } + let block_max_scores = docs.calculate_block_max_scores(doc_ids.iter(), frequencies.iter()); + let batch = builder.to_batch(block_max_scores)?; + let max_score = batch[MAX_SCORE_COL].as_primitive::().value(0); + let length = batch[LENGTH_COL].as_primitive::().value(0); + PostingList::from_batch(&batch, Some(max_score), Some(length)) + } + + fn union_posting_lists(postings: Vec, docs: &DocSet) -> Result { + let has_plain = postings + .iter() + .any(|posting| matches!(posting, PostingList::Plain(_))); + let has_compressed = postings + .iter() + .any(|posting| matches!(posting, PostingList::Compressed(_))); + match (has_plain, has_compressed) { + (true, true) => Err(Error::index( + "cannot union mixed plain and compressed posting lists".to_owned(), + )), + (true, false) => Self::union_plain_posting_lists(postings), + (false, true) => Self::union_compressed_posting_lists(postings, docs), + (false, false) => Ok(PostingList::Plain(PlainPostingList::new( + ScalarBuffer::from(Vec::::new()), + 
ScalarBuffer::from(Vec::::new()), + None, + None, + ))), + } } // search the documents that contain the query // return the doc info and the doc length // ref: https://en.wikipedia.org/wiki/Okapi_BM25 #[instrument(level = "debug", skip_all)] - pub async fn load_posting_lists( + async fn load_posting_lists( &self, tokens: &Tokens, params: &FtsSearchParams, + operator: Operator, metrics: &dyn MetricsCollector, - ) -> Result> { + ) -> Result { let is_fuzzy = matches!(params.fuzziness, Some(n) if n != 0); let is_phrase_query = params.phrase_slop.is_some(); + let is_and_query = operator == Operator::And; + let required_positions = (is_and_query || is_phrase_query).then(|| { + (0..tokens.len()) + .map(|index| tokens.position(index)) + .collect::>() + }); let tokens = match is_fuzzy { true => self.expand_fuzzy(tokens, params)?, false => tokens.clone(), @@ -1342,45 +1579,146 @@ impl InvertedPartition { .map(|index| tokens.position(index)) .collect::>(); let mut token_ids = Vec::with_capacity(tokens.len()); + let mut matched_positions = HashSet::new(); for (index, token) in tokens.into_iter().enumerate() { let token_id = self.map(&token); if let Some(token_id) = token_id { - token_ids.push((token_id, token, token_positions[index])); - } else if is_phrase_query { - // if the token is not found, we can't do phrase query - return Ok(Vec::new()); + let position = token_positions[index]; + matched_positions.insert(position); + token_ids.push((token_id, token, position)); + } else if is_phrase_query || is_and_query { + // if the token is not found, we can't do phrase or AND query + return Ok(LoadedPostings::empty()); } } if token_ids.is_empty() { - return Ok(Vec::new()); + return Ok(LoadedPostings::empty()); } + if let Some(required_positions) = required_positions + && !required_positions.is_subset(&matched_positions) + { + return Ok(LoadedPostings::empty()); + } + + let is_fuzzy_and_query = is_fuzzy && is_and_query && !is_phrase_query; if !is_phrase_query { - 
token_ids.sort_unstable_by_key(|(token_id, _, _)| *token_id); - token_ids.dedup_by_key(|(token_id, _, _)| *token_id); + if is_fuzzy_and_query { + token_ids.sort_unstable_by_key(|(token_id, _, position)| (*position, *token_id)); + token_ids.dedup_by(|lhs, rhs| lhs.0 == rhs.0 && lhs.2 == rhs.2); + } else { + token_ids.sort_unstable_by_key(|(token_id, _, _)| *token_id); + token_ids.dedup_by_key(|(token_id, _, _)| *token_id); + } } let num_docs = self.docs.len(); - stream::iter(token_ids) + let loaded_postings = stream::iter(token_ids) .map(|(token_id, token, position)| async move { let posting = self .inverted_list .posting_list(token_id, is_phrase_query, metrics) .await?; - let query_weight = idf(posting.len(), num_docs); - - Result::Ok(PostingIterator::with_query_weight( - token, - token_id, - position, - query_weight, - posting, - num_docs, - )) + Result::Ok((token_id, token, position, posting)) }) .buffered(self.store.io_parallelism()) .try_collect::>() - .await + .await?; + + if (is_and_query || is_phrase_query) + && !is_fuzzy_and_query + && loaded_postings + .iter() + .any(|(_, _, _, posting)| posting.is_empty()) + { + return Ok(LoadedPostings::empty()); + } + + if !is_fuzzy_and_query { + return Ok(LoadedPostings { + postings: loaded_postings + .into_iter() + .map(|(token_id, token, position, posting)| { + let query_weight = idf(posting.len(), num_docs); + PostingIterator::with_query_weight( + token, + token_id, + position, + query_weight, + posting, + num_docs, + ) + }) + .collect(), + grouped_expansions: Vec::new(), + }); + } + + let needs_union = loaded_postings + .windows(2) + .any(|window| window[0].2 == window[1].2); + let docs_for_union = if needs_union { + Some(self.docs.ensure_num_tokens_loaded().await?) + } else { + None + }; + + // WAND's AND mode treats every iterator as required, so expansions from + // one original query position must be merged before scoring. 
+ let mut grouped_postings = Vec::new(); + let mut grouped_expansions = Vec::new(); + let mut iter = loaded_postings.into_iter().peekable(); + while let Some((token_id, token, position, posting)) = iter.next() { + let mut group = vec![(token_id, token, posting)]; + while matches!(iter.peek(), Some((_, _, next_position, _)) if *next_position == position) + { + let (token_id, token, _, posting) = iter.next().expect("peeked item must exist"); + group.push((token_id, token, posting)); + } + + let (token_id, token, posting) = if group.len() == 1 { + group.pop().expect("single-item group must exist") + } else { + let token_id = group[0].0; + let token = group[0].1.clone(); + grouped_expansions.push(GroupedExpansionTerms { + position, + terms: group + .iter() + .map(|(_, token, posting)| ExpansionTermFreqs::new(token.clone(), posting)) + .collect(), + }); + let postings = group + .into_iter() + .map(|(_, _, posting)| posting) + .collect::>(); + let posting = Self::union_posting_lists( + postings, + docs_for_union + .as_deref() + .expect("union docs must be loaded for grouped fuzzy AND"), + )?; + (token_id, token, posting) + }; + if posting.is_empty() { + return Ok(LoadedPostings::empty()); + } + + let query_weight = idf(posting.len(), num_docs); + grouped_postings.push(PostingIterator::with_query_weight( + token, + token_id, + position, + query_weight, + posting, + num_docs, + )); + } + + Ok(LoadedPostings { + postings: grouped_postings, + grouped_expansions, + }) } #[instrument(level = "debug", skip_all)] @@ -5289,7 +5627,7 @@ mod tests { use lance_core::utils::tempfile::TempObjDir; use lance_io::object_store::ObjectStore; - use crate::metrics::NoOpMetricsCollector; + use crate::metrics::{LocalMetricsCollector, NoOpMetricsCollector}; use crate::prefilter::NoFilter; use crate::scalar::ScalarIndex; use crate::scalar::inverted::builder::{ @@ -5307,6 +5645,7 @@ mod tests { use arrow_schema::{DataType, Field, Schema}; use std::collections::HashMap; use std::sync::Arc; + use 
std::sync::atomic::Ordering; use crate::scalar::inverted::tokenizer::document_tokenizer::TextTokenizer; use lance_tokenizer::{Language, SimpleTokenizer, StopWordFilter, TextAnalyzer}; @@ -7200,6 +7539,421 @@ mod tests { } } + async fn write_test_metadata( + store: &Arc, + partition_ids: Vec, + params: InvertedIndexParams, + ) { + let metadata = HashMap::from([ + ( + "partitions".to_owned(), + serde_json::to_string(&partition_ids).unwrap(), + ), + ("params".to_owned(), serde_json::to_string(&params).unwrap()), + ( + TOKEN_SET_FORMAT_KEY.to_owned(), + TokenSetFormat::default().to_string(), + ), + ]); + let mut writer = store + .new_index_file(METADATA_FILE, Arc::new(arrow_schema::Schema::empty())) + .await + .unwrap(); + writer.finish_with_metadata(metadata).await.unwrap(); + } + + #[tokio::test] + async fn test_and_query_returns_empty_when_exact_term_missing() { + let tmpdir = TempObjDir::default(); + let store = Arc::new(LanceIndexStore::new( + ObjectStore::local().into(), + tmpdir.clone(), + Arc::new(LanceCache::no_cache()), + )); + + let mut builder = InnerBuilder::new(0, false, TokenSetFormat::default()); + builder.tokens.add("alpha".to_owned()); + builder.posting_lists.push(PostingListBuilder::new(false)); + builder.posting_lists[0].add(0, PositionRecorder::Count(1)); + builder.docs.append(100, 1); + builder.write(store.as_ref()).await.unwrap(); + + write_test_metadata(&store, vec![0], InvertedIndexParams::default()).await; + let cache = Arc::new(LanceCache::with_capacity(4096)); + let index = InvertedIndex::load(store.clone(), None, cache.as_ref()) + .await + .unwrap(); + + let tokens = Arc::new(Tokens::new( + vec!["alpha".to_owned(), "missing".to_owned()], + DocType::Text, + )); + let params = Arc::new(FtsSearchParams::new().with_limit(Some(10))); + let prefilter = Arc::new(NoFilter); + let metrics = Arc::new(NoOpMetricsCollector); + + let (and_row_ids, _) = index + .bm25_search( + tokens.clone(), + params.clone(), + Operator::And, + prefilter.clone(), +
metrics.clone(), + None, + ) + .await + .unwrap(); + assert!( + and_row_ids.is_empty(), + "AND must not match when any required term is missing" + ); + + let (or_row_ids, _) = index + .bm25_search(tokens, params, Operator::Or, prefilter, metrics, None) + .await + .unwrap(); + assert_eq!( + or_row_ids, + vec![100], + "OR should still match the present term" + ); + } + + #[tokio::test] + async fn test_and_query_skips_partition_missing_required_term() { + let tmpdir = TempObjDir::default(); + let store = Arc::new(LanceIndexStore::new( + ObjectStore::local().into(), + tmpdir.clone(), + Arc::new(LanceCache::no_cache()), + )); + + let mut builder0 = InnerBuilder::new(0, false, TokenSetFormat::default()); + builder0.tokens.add("alpha".to_owned()); + builder0.posting_lists.push(PostingListBuilder::new(false)); + builder0.posting_lists[0].add(0, PositionRecorder::Count(1)); + builder0.docs.append(100, 1); + builder0.write(store.as_ref()).await.unwrap(); + + let mut builder1 = InnerBuilder::new(1, false, TokenSetFormat::default()); + builder1.tokens.add("alpha".to_owned()); + builder1.tokens.add("beta".to_owned()); + builder1.posting_lists.push(PostingListBuilder::new(false)); + builder1.posting_lists.push(PostingListBuilder::new(false)); + builder1.posting_lists[0].add(0, PositionRecorder::Count(1)); + builder1.posting_lists[1].add(0, PositionRecorder::Count(1)); + builder1.docs.append(200, 2); + builder1.write(store.as_ref()).await.unwrap(); + + write_test_metadata(&store, vec![0, 1], InvertedIndexParams::default()).await; + let cache = Arc::new(LanceCache::with_capacity(4096)); + let index = InvertedIndex::load(store.clone(), None, cache.as_ref()) + .await + .unwrap(); + + let tokens = Arc::new(Tokens::new( + vec!["alpha".to_owned(), "beta".to_owned()], + DocType::Text, + )); + let params = Arc::new(FtsSearchParams::new().with_limit(Some(10))); + let (mut row_ids, _) = index + .bm25_search( + tokens, + params, + Operator::And, + Arc::new(NoFilter), + 
Arc::new(NoOpMetricsCollector), + None, + ) + .await + .unwrap(); + row_ids.sort_unstable(); + assert_eq!( + row_ids, + vec![200], + "partition missing beta must not contribute alpha-only hits" + ); + } + + #[tokio::test] + async fn test_fuzzy_and_groups_expansions_by_original_position() { + let tmpdir = TempObjDir::default(); + let store = Arc::new(LanceIndexStore::new( + ObjectStore::local().into(), + tmpdir.clone(), + Arc::new(LanceCache::no_cache()), + )); + + let mut builder = InnerBuilder::new(0, false, TokenSetFormat::default()); + builder.tokens.add("alpha".to_owned()); + builder.tokens.add("alphi".to_owned()); + builder.tokens.add("beta".to_owned()); + builder.posting_lists.push(PostingListBuilder::new(false)); + builder.posting_lists.push(PostingListBuilder::new(false)); + builder.posting_lists.push(PostingListBuilder::new(false)); + builder.posting_lists[0].add(0, PositionRecorder::Count(1)); + builder.posting_lists[1].add(1, PositionRecorder::Count(1)); + builder.posting_lists[2].add(0, PositionRecorder::Count(1)); + builder.posting_lists[2].add(1, PositionRecorder::Count(1)); + builder.docs.append(100, 2); + builder.docs.append(101, 2); + builder.write(store.as_ref()).await.unwrap(); + + write_test_metadata(&store, vec![0], InvertedIndexParams::default()).await; + let cache = Arc::new(LanceCache::with_capacity(4096)); + let index = InvertedIndex::load(store.clone(), None, cache.as_ref()) + .await + .unwrap(); + let params = Arc::new( + FtsSearchParams::new() + .with_limit(Some(10)) + .with_fuzziness(Some(1)), + ); + + let missing_position_tokens = Arc::new(Tokens::new( + vec!["betx".to_owned(), "zzzzz".to_owned()], + DocType::Text, + )); + let (missing_and_row_ids, _) = index + .bm25_search( + missing_position_tokens.clone(), + params.clone(), + Operator::And, + Arc::new(NoFilter), + Arc::new(NoOpMetricsCollector), + None, + ) + .await + .unwrap(); + assert!( + missing_and_row_ids.is_empty(), + "fuzzy AND must require at least one expansion for every 
original position" + ); + + let (mut or_row_ids, _) = index + .bm25_search( + missing_position_tokens, + params.clone(), + Operator::Or, + Arc::new(NoFilter), + Arc::new(NoOpMetricsCollector), + None, + ) + .await + .unwrap(); + or_row_ids.sort_unstable(); + assert_eq!( + or_row_ids, + vec![100, 101], + "OR should still match present fuzzy expansions" + ); + + let grouped_tokens = Arc::new(Tokens::new( + vec!["alphx".to_owned(), "betx".to_owned()], + DocType::Text, + )); + let (mut grouped_row_ids, _) = index + .bm25_search( + grouped_tokens, + params, + Operator::And, + Arc::new(NoFilter), + Arc::new(NoOpMetricsCollector), + None, + ) + .await + .unwrap(); + grouped_row_ids.sort_unstable(); + assert_eq!( + grouped_row_ids, + vec![100, 101], + "each original fuzzy position should match any one of its expansions" + ); + } + + #[tokio::test] + async fn test_fuzzy_expansion_cap_applies_to_whole_query() { + let tmpdir = TempObjDir::default(); + let store = Arc::new(LanceIndexStore::new( + ObjectStore::local().into(), + tmpdir.clone(), + Arc::new(LanceCache::no_cache()), + )); + + let mut builder = InnerBuilder::new(0, false, TokenSetFormat::default()); + for token in ["alpha", "alphi", "beta", "beti"] { + builder.tokens.add(token.to_owned()); + builder.posting_lists.push(PostingListBuilder::new(false)); + } + for token_id in 0..4 { + builder.posting_lists[token_id].add(token_id as u32, PositionRecorder::Count(1)); + builder.docs.append(100 + token_id as u64, 1); + } + builder.write(store.as_ref()).await.unwrap(); + + write_test_metadata(&store, vec![0], InvertedIndexParams::default()).await; + let cache = Arc::new(LanceCache::with_capacity(4096)); + let index = InvertedIndex::load(store.clone(), None, cache.as_ref()) + .await + .unwrap(); + let partition = index.partitions[0].clone(); + let params = FtsSearchParams::new() + .with_fuzziness(Some(1)) + .with_max_expansions(3); + let tokens = Tokens::new(vec!["alphx".to_owned(), "betx".to_owned()], DocType::Text); + + let 
expanded = partition.expand_fuzzy(&tokens, &params).unwrap(); + let expanded_terms = (0..expanded.len()) + .map(|idx| (expanded.get_token(idx).to_owned(), expanded.position(idx))) + .collect::>(); + + assert_eq!( + expanded_terms, + vec![ + ("alpha".to_owned(), 0), + ("alphi".to_owned(), 0), + ("beta".to_owned(), 1), + ], + "max_expansions should cap the whole fuzzy query, not each token" + ); + } + + #[tokio::test] + async fn test_fuzzy_and_scores_grouped_expansions_by_matched_token() { + let tmpdir = TempObjDir::default(); + let store = Arc::new(LanceIndexStore::new( + ObjectStore::local().into(), + tmpdir.clone(), + Arc::new(LanceCache::no_cache()), + )); + + let mut builder = InnerBuilder::new(0, false, TokenSetFormat::default()); + builder.tokens.add("alpha".to_owned()); + builder.tokens.add("alphi".to_owned()); + builder.tokens.add("beta".to_owned()); + builder.posting_lists.push(PostingListBuilder::new(false)); + builder.posting_lists.push(PostingListBuilder::new(false)); + builder.posting_lists.push(PostingListBuilder::new(false)); + builder.posting_lists[0].add(0, PositionRecorder::Count(1)); + builder.posting_lists[0].add(2, PositionRecorder::Count(1)); + builder.posting_lists[0].add(3, PositionRecorder::Count(1)); + builder.posting_lists[0].add(4, PositionRecorder::Count(1)); + builder.posting_lists[0].add(5, PositionRecorder::Count(1)); + builder.posting_lists[1].add(1, PositionRecorder::Count(1)); + builder.posting_lists[2].add(0, PositionRecorder::Count(1)); + builder.posting_lists[2].add(1, PositionRecorder::Count(1)); + builder.docs.append(100, 2); + builder.docs.append(101, 2); + builder.docs.append(102, 1); + builder.docs.append(103, 1); + builder.docs.append(104, 1); + builder.docs.append(105, 1); + builder.write(store.as_ref()).await.unwrap(); + + write_test_metadata(&store, vec![0], InvertedIndexParams::default()).await; + let cache = Arc::new(LanceCache::with_capacity(4096)); + let index = InvertedIndex::load(store.clone(), None, cache.as_ref()) +
.await + .unwrap(); + + let tokens = Arc::new(Tokens::new( + vec!["alphx".to_owned(), "betx".to_owned()], + DocType::Text, + )); + let params = Arc::new( + FtsSearchParams::new() + .with_limit(Some(1)) + .with_fuzziness(Some(1)), + ); + let (row_ids, _scores) = index + .bm25_search( + tokens, + params, + Operator::And, + Arc::new(NoFilter), + Arc::new(NoOpMetricsCollector), + None, + ) + .await + .unwrap(); + + assert_eq!( + row_ids, + vec![101], + "the rare matched expansion should outrank the common expansion" + ); + } + + #[tokio::test] + async fn test_fuzzy_and_grouped_rescore_keeps_wand_limit_bounded() { + let tmpdir = TempObjDir::default(); + let store = Arc::new(LanceIndexStore::new( + ObjectStore::local().into(), + tmpdir.clone(), + Arc::new(LanceCache::no_cache()), + )); + + let num_docs = BLOCK_SIZE * 2 + 4; + let mut builder = InnerBuilder::new(0, false, TokenSetFormat::default()); + builder.tokens.add("alpha".to_owned()); + builder.tokens.add("alphi".to_owned()); + builder.tokens.add("beta".to_owned()); + builder.posting_lists.push(PostingListBuilder::new(false)); + builder.posting_lists.push(PostingListBuilder::new(false)); + builder.posting_lists.push(PostingListBuilder::new(false)); + + builder.posting_lists[0].add(0, PositionRecorder::Count(1)); + builder.posting_lists[1].add(1, PositionRecorder::Count(1)); + for doc_id in 0..num_docs { + builder.posting_lists[2].add(doc_id as u32, PositionRecorder::Count(1)); + if doc_id >= 2 { + builder.posting_lists[0].add(doc_id as u32, PositionRecorder::Count(1)); + } + let num_tokens = if doc_id < 2 { 2 } else { 100 }; + builder.docs.append(100 + doc_id as u64, num_tokens); + } + builder.write(store.as_ref()).await.unwrap(); + + write_test_metadata(&store, vec![0], InvertedIndexParams::default()).await; + let cache = Arc::new(LanceCache::with_capacity(4096)); + let index = InvertedIndex::load(store.clone(), None, cache.as_ref()) + .await + .unwrap(); + + let tokens = Arc::new(Tokens::new( + 
vec!["alphx".to_owned(), "betx".to_owned()], + DocType::Text, + )); + let params = Arc::new( + FtsSearchParams::new() + .with_limit(Some(1)) + .with_fuzziness(Some(1)), + ); + let metrics = Arc::new(LocalMetricsCollector::default()); + let (row_ids, _scores) = index + .bm25_search( + tokens, + params, + Operator::And, + Arc::new(NoFilter), + metrics.clone(), + None, + ) + .await + .unwrap(); + + assert_eq!( + row_ids, + vec![101], + "final rescoring should still rank by the matched expansion" + ); + let comparisons = metrics.comparisons.load(Ordering::Relaxed); + assert!( + comparisons < num_docs, + "grouped fuzzy AND should not clear the WAND top-k bound and scan every candidate; comparisons={comparisons}, num_docs={num_docs}" + ); + } + #[tokio::test] async fn test_phrase_query_reads_legacy_per_doc_positions() { let tmpdir = TempObjDir::default(); diff --git a/rust/lance-index/src/scalar/inverted/wand.rs b/rust/lance-index/src/scalar/inverted/wand.rs index dc6d2a860fb..d693bef7877 100644 --- a/rust/lance-index/src/scalar/inverted/wand.rs +++ b/rust/lance-index/src/scalar/inverted/wand.rs @@ -448,6 +448,9 @@ pub enum CandidateAddr { #[derive(Debug)] pub struct DocCandidate { pub addr: CandidateAddr, + /// The document key used by the posting lists: doc_id for compressed + /// postings, row_id for legacy plain postings. + pub posting_doc_id: u64, /// (term_index, freq) pub freqs: Vec<(u32, u32)>, pub doc_length: u32, @@ -726,6 +729,7 @@ impl<'a, S: Scorer> Wand<'a, S> { // Either a real row_id (so we can run the mask check // inline) or the doc_id widened to u64 (deferred path; // the outer caller will resolve it post-wand). 
+ let posting_doc_id = doc.doc_id(); let row_id = match &doc { DocInfo::Raw(doc) => { if docs_has_row_ids { @@ -778,7 +782,12 @@ impl<'a, S: Scorer> Wand<'a, S> { if candidates.len() < limit { let freqs = self.iter_term_freqs().collect(); - candidates.push(Reverse((ScoredDoc::new(row_id, score), freqs, doc_length))); + candidates.push(Reverse(( + ScoredDoc::new(row_id, score), + freqs, + doc_length, + posting_doc_id, + ))); if candidates.len() == limit { let kth = candidates.peek().unwrap().0.0.score.0; self.update_threshold(kth, params.wand_factor); @@ -786,7 +795,12 @@ impl<'a, S: Scorer> Wand<'a, S> { } else if score > candidates.peek().unwrap().0.0.score.0 { let freqs = self.iter_term_freqs().collect(); candidates.pop(); - candidates.push(Reverse((ScoredDoc::new(row_id, score), freqs, doc_length))); + candidates.push(Reverse(( + ScoredDoc::new(row_id, score), + freqs, + doc_length, + posting_doc_id, + ))); let kth = candidates.peek().unwrap().0.0.score.0; self.update_threshold(kth, params.wand_factor); } @@ -809,11 +823,14 @@ impl<'a, S: Scorer> Wand<'a, S> { }; Ok(candidates .into_iter() - .map(|Reverse((doc, freqs, doc_length))| DocCandidate { - addr: to_addr(doc.row_id), - freqs, - doc_length, - }) + .map( + |Reverse((doc, freqs, doc_length, posting_doc_id))| DocCandidate { + addr: to_addr(doc.row_id), + posting_doc_id, + freqs, + doc_length, + }, + ) .collect()) } @@ -903,7 +920,12 @@ impl<'a, S: Scorer> Wand<'a, S> { if candidates.len() < limit { let freqs = self.iter_term_freqs().collect(); - candidates.push(Reverse((ScoredDoc::new(row_id, score), freqs, doc_length))); + candidates.push(Reverse(( + ScoredDoc::new(row_id, score), + freqs, + doc_length, + doc_id, + ))); if candidates.len() == limit { let kth = candidates.peek().unwrap().0.0.score.0; self.update_threshold(kth, params.wand_factor); @@ -911,7 +933,12 @@ impl<'a, S: Scorer> Wand<'a, S> { } else if score > candidates.peek().unwrap().0.0.score.0 { let freqs = self.iter_term_freqs().collect(); 
candidates.pop(); - candidates.push(Reverse((ScoredDoc::new(row_id, score), freqs, doc_length))); + candidates.push(Reverse(( + ScoredDoc::new(row_id, score), + freqs, + doc_length, + doc_id, + ))); let kth = candidates.peek().unwrap().0.0.score.0; self.update_threshold(kth, params.wand_factor); } @@ -924,11 +951,14 @@ impl<'a, S: Scorer> Wand<'a, S> { // every candidate already has a real row_id. Ok(candidates .into_iter() - .map(|Reverse((doc, freqs, doc_length))| DocCandidate { - addr: CandidateAddr::RowId(doc.row_id), - freqs, - doc_length, - }) + .map( + |Reverse((doc, freqs, doc_length, posting_doc_id))| DocCandidate { + addr: CandidateAddr::RowId(doc.row_id), + posting_doc_id, + freqs, + doc_length, + }, + ) .collect()) } From a13a760f8a901143a843c292474866cd55175cc7 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Tue, 23 Jun 2026 00:55:24 +0800 Subject: [PATCH 160/177] perf: speed up ICU FTS index builds by 11% (#7393) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This improves ICU FTS index build throughput without changing the produced index behavior. The main cost in the 1M ICU build was allocation churn in token normalization and term registration. This keeps Unicode normalization output unchanged while avoiding unnecessary token string moves in the inverted-index builder and unnecessary Unicode lowercase rewrites for tokens that are already lowercase-stable. Performance comparison, release build with no profiler, 1M rows, ICU tokenizer, 8 workers, memory_limit=16384: | run | build | total | tokenize 100% | |---|---:|---:|---:| | baseline | 268.472s | 268.632s | 251.142s | | this PR | 238.698s | 239.205s | 223.668s | Compared with baseline, this saves 29.774s on build time, an 11.09% reduction. The benchmark run verified the rebuilt FTS index on 1,000,000 rows and returned results for both `the` and `中国` queries.
Local validation passed with Rust formatting, full clippy, tokenizer tests, and focused inverted-index tests. --- .../src/scalar/inverted/builder.rs | 9 ++-- .../src/ascii_folding_filter.rs | 14 ++++-- rust/lance-tokenizer/src/lower_caser.rs | 47 ++++++++++++++++--- 3 files changed, 55 insertions(+), 15 deletions(-) diff --git a/rust/lance-index/src/scalar/inverted/builder.rs b/rust/lance-index/src/scalar/inverted/builder.rs index 93932f35332..17cb18c5e96 100644 --- a/rust/lance-index/src/scalar/inverted/builder.rs +++ b/rust/lance-index/src/scalar/inverted/builder.rs @@ -1319,9 +1319,8 @@ impl IndexWorker { let mut token_stream = self.tokenizer.token_stream_for_doc(doc); while token_stream.advance() { - let token = token_stream.token_mut(); - let token_text = std::mem::take(&mut token.text); - let token_id = builder.tokens.add(token_text); + let token = token_stream.token(); + let token_id = builder.tokens.get_or_add(&token.text); if token_id as usize == builder.posting_lists.len() { let old_posting_lists_overhead_size = (builder.posting_lists.capacity() * std::mem::size_of::()) @@ -1360,9 +1359,7 @@ impl IndexWorker { let mut token_stream = self.tokenizer.token_stream_for_doc(doc); while token_stream.advance() { - let token = token_stream.token_mut(); - let token_text = std::mem::take(&mut token.text); - let token_id = self.builder.tokens.add(token_text); + let token_id = self.builder.tokens.get_or_add(&token_stream.token().text); self.token_ids.push(token_id); token_num += 1; } diff --git a/rust/lance-tokenizer/src/ascii_folding_filter.rs b/rust/lance-tokenizer/src/ascii_folding_filter.rs index 376c0e1ebdb..8800545f1fb 100644 --- a/rust/lance-tokenizer/src/ascii_folding_filter.rs +++ b/rust/lance-tokenizer/src/ascii_folding_filter.rs @@ -49,9 +49,10 @@ impl TokenStream for AsciiFoldingFilterTokenStream<'_, T> { if !self.tail.advance() { return false; } - if !self.token_mut().text.is_ascii() { - to_ascii(&self.tail.token().text, self.buffer); - mem::swap(&mut 
self.tail.token_mut().text, self.buffer); + let token = self.tail.token_mut(); + if !token.text.is_ascii() { + to_ascii(&token.text, self.buffer); + mem::swap(&mut token.text, self.buffer); } true } @@ -67,6 +68,7 @@ impl TokenStream for AsciiFoldingFilterTokenStream<'_, T> { fn to_ascii(text: &str, output: &mut String) { output.clear(); + output.reserve(text.len()); for ch in text.chars() { if ch.is_ascii() { output.push(ch); @@ -149,4 +151,10 @@ mod tests { let tokens = collect_tokens("straße"); assert_eq!(tokens[0].text, "strasse"); } + + #[test] + fn test_ascii_folding_cjk_unchanged() { + let tokens = collect_tokens("こんにちは世界"); + assert_eq!(tokens[0].text, "こんにちは世界"); + } } diff --git a/rust/lance-tokenizer/src/lower_caser.rs b/rust/lance-tokenizer/src/lower_caser.rs index a041ac04e1f..3ad430f2f5a 100644 --- a/rust/lance-tokenizer/src/lower_caser.rs +++ b/rust/lance-tokenizer/src/lower_caser.rs @@ -47,22 +47,30 @@ pub struct LowerCaserTokenStream<'a, T> { fn to_lowercase_unicode(text: &str, output: &mut String) { output.clear(); - output.reserve(50); + output.reserve(text.len()); for ch in text.chars() { output.extend(ch.to_lowercase()); } } +fn is_lowercase_stable(text: &str) -> bool { + text.chars().all(|ch| { + let mut lower = ch.to_lowercase(); + lower.next() == Some(ch) && lower.next().is_none() + }) +} + impl TokenStream for LowerCaserTokenStream<'_, T> { fn advance(&mut self) -> bool { if !self.tail.advance() { return false; } - if self.token_mut().text.is_ascii() { - self.token_mut().text.make_ascii_lowercase(); - } else { - to_lowercase_unicode(&self.tail.token().text, self.buffer); - mem::swap(&mut self.tail.token_mut().text, self.buffer); + let token = self.tail.token_mut(); + if token.text.is_ascii() { + token.text.make_ascii_lowercase(); + } else if !is_lowercase_stable(&token.text) { + to_lowercase_unicode(&token.text, self.buffer); + mem::swap(&mut token.text, self.buffer); } true } @@ -75,3 +83,30 @@ impl TokenStream for 
LowerCaserTokenStream<'_, T> { self.tail.token_mut() } } + +#[cfg(test)] +mod tests { + use crate::{LowerCaser, RawTokenizer, TextAnalyzer, Token}; + + fn collect_tokens(text: &str) -> Vec { + let mut analyzer = TextAnalyzer::builder(RawTokenizer::default()) + .filter(LowerCaser) + .build(); + let mut stream = analyzer.token_stream(text); + let mut tokens = Vec::new(); + stream.process(&mut |token| tokens.push(token.clone())); + tokens + } + + #[test] + fn test_lower_caser_unicode_changed() { + let tokens = collect_tokens("İSTANBUL"); + assert_eq!(tokens[0].text, "i\u{307}stanbul"); + } + + #[test] + fn test_lower_caser_unicode_unchanged() { + let tokens = collect_tokens("こんにちは世界"); + assert_eq!(tokens[0].text, "こんにちは世界"); + } +} From bccd5a2ccb615c322ff56fb3d81726b48d7b5590 Mon Sep 17 00:00:00 2001 From: Wyatt Alt Date: Mon, 22 Jun 2026 10:29:45 -0700 Subject: [PATCH 161/177] feat: support COUNT(*) pushdown on stable row id datasets (#7360) `COUNT(*)` pushdown (`count_pushdown` -> `CountFromMaskExec`) was disabled on datasets using stable row ids, so those counts fell back to a full scan. The fast path intersects the scalar-index prefilter and the deletion mask (both in stable-id space) with a fragments-allow universe -- but that universe was built in row-address space. ANDing across the two id spaces dropped rows in fragments > 0, so the rule was gated off entirely under stable row ids. The fix builds the universe in stable-id space via the live-id deletion mask (restricted to the covered fragments). An unfiltered count has no prefilter, so the universe is never materialized -- the answer comes straight from fragment metadata. 
Benchmark (synthetic: 5M rows, 50 fragments, ~1% scattered deletions, BTree on the filter column), `cargo bench -p lance --bench count_pushdown`, this branch vs `main`: | benchmark | before | after | speedup | |---|---|---|---| | `count_unfiltered` | 44.5 ms | 69 us | ~640x | | `count_filtered_1pct` | 1.53 ms | 258 us | ~5.9x | | `count_filtered_50pct` | 28.3 ms | 5.5 ms | ~5.1x | --------- Co-authored-by: Claude Opus 4.8 (1M context) --- rust/lance/Cargo.toml | 4 + rust/lance/benches/count_pushdown.rs | 128 ++++++++++++++ .../src/dataset/tests/dataset_aggregate.rs | 60 +++++++ rust/lance/src/io/exec/count_from_mask.rs | 161 +++++++++++++----- rust/lance/src/io/exec/count_pushdown.rs | 74 +++++--- 5 files changed, 368 insertions(+), 59 deletions(-) create mode 100644 rust/lance/benches/count_pushdown.rs diff --git a/rust/lance/Cargo.toml b/rust/lance/Cargo.toml index 6586c928de7..100aa42ea20 100644 --- a/rust/lance/Cargo.toml +++ b/rust/lance/Cargo.toml @@ -187,6 +187,10 @@ harness = false name = "scan" harness = false +[[bench]] +name = "count_pushdown" +harness = false + [[bench]] name = "vector_index" harness = false diff --git a/rust/lance/benches/count_pushdown.rs b/rust/lance/benches/count_pushdown.rs new file mode 100644 index 00000000000..4f633d489dc --- /dev/null +++ b/rust/lance/benches/count_pushdown.rs @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Benchmarks for `COUNT(*)` via the scanner aggregate plan (the path the +//! `count_pushdown` rule rewrites into `CountFromMaskExec`). +//! +//! The dataset uses stable row ids, multiple fragments, and scattered +//! cross-fragment deletions, with a BTree scalar index on the filter column. +//! Run on two revisions to compare (e.g. before/after a change to the rule): +//! +//! ```text +//! cargo bench -p lance --bench count_pushdown +//! 
``` + +use std::sync::Arc; + +use arrow_array::types::UInt32Type; +use criterion::{Criterion, criterion_group, criterion_main}; +use lance::Dataset; +use lance::dataset::WriteParams; +use lance::index::DatasetIndexExt; +use lance_core::utils::tempfile::TempStrDir; +use lance_datagen::{BatchCount, RowCount, array, gen_batch}; +use lance_index::IndexType; +use lance_index::scalar::ScalarIndexParams; +#[cfg(target_os = "linux")] +use lance_testing::pprof::{Output, PProfProfiler}; + +const ROWS_PER_FRAGMENT: usize = 100_000; +const NUM_FRAGMENTS: usize = 50; +const TOTAL_ROWS: u32 = (ROWS_PER_FRAGMENT * NUM_FRAGMENTS) as u32; // 5,000,000 + +struct Fixture { + _datadir: TempStrDir, + dataset: Arc<Dataset>, +} + +impl Fixture { + async fn open() -> Self { + let datadir = TempStrDir::default(); + // `value` steps 0..TOTAL_ROWS, so `value < k` selects exactly k rows + // (before deletions) and gives precise control over selectivity. + let reader = gen_batch() + .col("value", array::step::<UInt32Type>()) + .into_reader_rows( + RowCount::from(ROWS_PER_FRAGMENT as u64), + BatchCount::from(NUM_FRAGMENTS as u32), + ); + let mut dataset = Dataset::write( + reader, + datadir.as_str(), + Some(WriteParams { + max_rows_per_file: ROWS_PER_FRAGMENT, + enable_stable_row_ids: true, + ..Default::default() + }), + ) + .await + .unwrap(); + + // Scatter deletions across every fragment (~1%) to exercise the + // deletion mask in stable-id space. 
+ dataset.delete("value % 100 = 0").await.unwrap(); + + dataset + .create_index( + &["value"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + + Self { + _datadir: datadir, + dataset: Arc::new(dataset), + } + } +} + +async fn count_unfiltered(dataset: &Dataset) -> u64 { + dataset.scan().count_rows().await.unwrap() +} + +async fn count_filtered(dataset: &Dataset, filter: &str) -> u64 { + let mut scanner = dataset.scan(); + scanner.filter(filter).unwrap(); + scanner.count_rows().await.unwrap() +} + +fn bench_count(c: &mut Criterion) { + let rt = tokio::runtime::Runtime::new().unwrap(); + let fixture = rt.block_on(Fixture::open()); + let ds = &fixture.dataset; + + c.bench_function("count_unfiltered", |b| { + b.iter(|| rt.block_on(count_unfiltered(ds))) + }); + + // ~1% of rows match. + let filter_1pct = format!("value < {}", TOTAL_ROWS / 100); + c.bench_function("count_filtered_1pct", |b| { + b.iter(|| rt.block_on(count_filtered(ds, &filter_1pct))) + }); + + // ~50% of rows match. 
+ let filter_50pct = format!("value < {}", TOTAL_ROWS / 2); + c.bench_function("count_filtered_50pct", |b| { + b.iter(|| rt.block_on(count_filtered(ds, &filter_50pct))) + }); +} + +#[cfg(target_os = "linux")] +criterion_group!( + name = benches; + config = Criterion::default().significance_level(0.1).sample_size(10) + .with_profiler(PProfProfiler::new(100, Output::Flamegraph(None))); + targets = bench_count); + +#[cfg(not(target_os = "linux"))] +criterion_group!( + name = benches; + config = Criterion::default().significance_level(0.1).sample_size(10); + targets = bench_count); + +criterion_main!(benches); diff --git a/rust/lance/src/dataset/tests/dataset_aggregate.rs b/rust/lance/src/dataset/tests/dataset_aggregate.rs index d10a3e42769..8d45cda98e2 100644 --- a/rust/lance/src/dataset/tests/dataset_aggregate.rs +++ b/rust/lance/src/dataset/tests/dataset_aggregate.rs @@ -1301,6 +1301,66 @@ async fn test_scanner_count_rows_with_indexed_filter() { ); } +#[tokio::test] +async fn test_scanner_count_rows_with_indexed_filter_stable_row_ids() { + // Indexed-filter count under stable row ids, with deletions in both + // fragments. The rule fires and the cross-fragment count stays correct. + let tmp = tempdir().unwrap(); + let uri = tmp.path().to_str().unwrap(); + let mut ds = gen_batch() + .col("x", array::step::()) + .col("y", array::step_custom::(0, 2)) + .col("category", array::cycle::(vec![1, 2, 3])) + .into_dataset_with_params( + uri, + FragmentCount::from(2), + FragmentRowCount::from(50), + Some(crate::dataset::WriteParams { + max_rows_per_file: 50, + enable_stable_row_ids: true, + ..Default::default() + }), + ) + .await + .unwrap(); + ds.create_index( + &["x"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + // Delete one row from each fragment (x=10 in frag 0, x=70 in frag 1). 
+ ds.delete("x = 10 OR x = 70").await.unwrap(); + + let mut scanner = ds.scan(); + scanner.filter("x < 100").unwrap(); + scanner + .aggregate(AggregateExpr::builder().count_star().build()) + .unwrap(); + let plan = scanner.create_plan().await.unwrap(); + + assert_plan_node_equals( + plan.clone(), + "AggregateExec: mode=Final, gby=[], aggr=[count(Int32(1))] + CountFromMask + ScalarIndexQuery: query=[x < 100]@x_idx(BTree)", + ) + .await + .unwrap(); + + let stream = execute_plan(plan, LanceExecutionOptions::default()).unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + assert_eq!(batches.len(), 1); + // 100 rows match `x < 100`, minus the two deletions. + assert_eq!( + batches[0].column(0).as_primitive::().value(0), + 98, + ); +} + #[tokio::test] async fn test_scanner_count_rows_with_partial_index_coverage() { // Index covers the first two fragments, then a third fragment is diff --git a/rust/lance/src/io/exec/count_from_mask.rs b/rust/lance/src/io/exec/count_from_mask.rs index df0478ce208..0b7aeb11111 100644 --- a/rust/lance/src/io/exec/count_from_mask.rs +++ b/rust/lance/src/io/exec/count_from_mask.rs @@ -31,7 +31,7 @@ use datafusion::physical_plan::{ }; use datafusion_physical_expr::EquivalenceProperties; use datafusion_physical_expr::aggregate::AggregateFunctionExpr; -use futures::{StreamExt, TryStreamExt}; +use futures::{StreamExt, TryStreamExt, stream}; use lance_core::{Error, Result}; use lance_select::{RowAddrMask, RowAddrSelection, RowAddrTreeMap}; use lance_table::format::Fragment; @@ -40,6 +40,7 @@ use tracing::instrument; use super::utils::InstrumentedRecordBatchStreamAdapter; use crate::Dataset; +use crate::dataset::rowids::load_row_id_sequences; use crate::index::prefilter::DatasetPreFilter; /// An execution node that computes a `COUNT(*)`-style aggregate from an @@ -234,36 +235,16 @@ impl CountFromMaskExec { Ok(count) } - #[instrument(name = "count_from_mask", skip_all, level = "debug")] - async fn do_execute( - dataset: Arc, - 
aggregate_funcs_len: usize, - prefilter_input: Option>, - restrict_to_fragments: Option, - context: Arc, - schema: SchemaRef, - ) -> Result { - let prefilter = match prefilter_input { - None => None, - Some(input) => Some(Self::load_prefilter(input, context.clone()).await?), - }; - - // Anchor the deletion mask against either every dataset fragment or - // the caller-supplied restricted subset. - let dataset_fragments: RoaringBitmap = - dataset.fragments().iter().map(|f| f.id as u32).collect(); - let fragments_covered = match restrict_to_fragments { - Some(restrict) => dataset_fragments & restrict, - None => dataset_fragments, - }; - - // Build the fragments allow list as concrete `[0..physical_rows)` - // ranges rather than `Full` markers. `Full` interacts poorly with - // `BlockList` subtraction — `RowAddrTreeMap::Sub` materializes a - // `RoaringBitmap::full()` (2^32 rows) per fragment when a `Full` - // entry gets a partial block subtracted from it, which inflates - // counts and is expensive. Concrete ranges avoid that path entirely - // and keep `len()` exact at every combine step. + /// Row-address-space fragments-allow list: concrete `[0..physical_rows)` + /// ranges per covered fragment. + /// + /// Concrete ranges, not `Full` markers: subtracting a `BlockList` from a + /// `Full` entry materializes a `RoaringBitmap::full()` (2^32) per fragment, + /// which is slow and throws off `len()`. + fn address_fragments_allow( + dataset: &Dataset, + fragments_covered: &RoaringBitmap, + ) -> Result { let frag_map: HashMap = dataset .fragments() .iter() @@ -287,16 +268,118 @@ impl CountFromMaskExec { bitmap.insert_range(0u32..(physical as u32)); fragments_allow.insert_bitmap(frag_id, bitmap); } + Ok(fragments_allow) + } + + /// Live (non-deleted) row count of the covered fragments, from fragment + /// metadata. Used for an unfiltered count: no prefilter to intersect, so no + /// need to build the stable-id universe. 
+ async fn count_live_rows(dataset: &Dataset, fragments_covered: &RoaringBitmap) -> Result { + let frags = dataset + .get_fragments() + .into_iter() + .filter(|f| fragments_covered.contains(f.id() as u32)); + let counts = stream::iter(frags) + .map(|f| async move { f.count_rows(None).await }) + .buffer_unordered(dataset.object_store.as_ref().io_parallelism()) + .try_collect::>() + .await?; + Ok(counts.iter().sum::() as i64) + } + + /// Count universe in stable-id space: live stable row ids whose current home + /// is in `fragments_covered`. Staying in stable-id space lets it intersect + /// the index prefilter directly; deletions are already folded in, so the + /// caller passes no separate deletion mask. + async fn stable_id_universe( + dataset: &Arc, + fragments_covered: RoaringBitmap, + ) -> Result { + // create_restricted_deletion_mask gives a live-id allow list restricted + // to `fragments_covered`. It returns None only with no deletions and full + // coverage — then the universe is every stable id, loaded below. + if let Some(fut) = DatasetPreFilter::create_restricted_deletion_mask( + dataset.clone(), + fragments_covered.clone(), + ) { + let mask = fut.await?; + return mask.allow_list().cloned().ok_or_else(|| { + Error::internal( + "CountFromMaskExec: stable-row-id deletion mask must be an AllowList" + .to_string(), + ) + }); + } + Self::load_stable_id_universe(dataset, &fragments_covered).await + } + + /// Every stable row id in the covered fragments, from their row-id sequences + /// (metadata, not column data). Only used with no deletions and full coverage. 
+ async fn load_stable_id_universe( + dataset: &Dataset, + fragments_covered: &RoaringBitmap, + ) -> Result { + let frags: Vec = dataset + .fragments() + .iter() + .filter(|f| fragments_covered.contains(f.id as u32)) + .cloned() + .collect(); + let mut sequences = load_row_id_sequences(dataset, &frags); + let mut universe = RowAddrTreeMap::new(); + while let Some((_frag_id, sequence)) = sequences.try_next().await? { + universe |= RowAddrTreeMap::from(sequence.as_ref()); + } + Ok(universe) + } + + #[instrument(name = "count_from_mask", skip_all, level = "debug")] + async fn do_execute( + dataset: Arc, + aggregate_funcs_len: usize, + prefilter_input: Option>, + restrict_to_fragments: Option, + context: Arc, + schema: SchemaRef, + ) -> Result { + let prefilter = match prefilter_input { + None => None, + Some(input) => Some(Self::load_prefilter(input, context.clone()).await?), + }; - // Load the deletion mask for the covered fragments. - let deletion_mask = - match DatasetPreFilter::create_deletion_mask(dataset.clone(), fragments_covered) { - Some(fut) => Some(fut.await?), - None => None, - }; + // Anchor the deletion mask against either every dataset fragment or + // the caller-supplied restricted subset. + let dataset_fragments: RoaringBitmap = + dataset.fragments().iter().map(|f| f.id as u32).collect(); + let fragments_covered = match restrict_to_fragments { + Some(restrict) => dataset_fragments & restrict, + None => dataset_fragments, + }; - let combined = Self::combine_masks(fragments_allow, prefilter, deletion_mask); - let count = Self::count_from_mask(&combined, dataset.as_ref())?; + // Under stable row ids the prefilter and deletion masks are in stable-id + // space, so the universe must be too (see `stable_id_universe`); the + // default path builds it in row-address space. + let count = if dataset.manifest.uses_stable_row_ids() { + match prefilter { + // No prefilter: just the live row count, from metadata. 
+ None => Self::count_live_rows(&dataset, &fragments_covered).await?, + Some(prefilter) => { + let universe = Self::stable_id_universe(&dataset, fragments_covered).await?; + let combined = Self::combine_masks(universe, Some(prefilter), None); + Self::count_from_mask(&combined, dataset.as_ref())? + } + } + } else { + let fragments_allow = Self::address_fragments_allow(&dataset, &fragments_covered)?; + // Load the deletion mask for the covered fragments. + let deletion_mask = + match DatasetPreFilter::create_deletion_mask(dataset.clone(), fragments_covered) { + Some(fut) => Some(fut.await?), + None => None, + }; + let combined = Self::combine_masks(fragments_allow, prefilter, deletion_mask); + Self::count_from_mask(&combined, dataset.as_ref())? + }; // Every aggregate is the same non-distinct COUNT shape — emit the // count once per output column. diff --git a/rust/lance/src/io/exec/count_pushdown.rs b/rust/lance/src/io/exec/count_pushdown.rs index 3a3f442aa3e..d5d90b5881a 100644 --- a/rust/lance/src/io/exec/count_pushdown.rs +++ b/rust/lance/src/io/exec/count_pushdown.rs @@ -146,23 +146,6 @@ fn try_rewrite(agg: &AggregateExec) -> DFResult>> return Ok(None); }; - // Stable-row-id mode: `DatasetPreFilter::create_deletion_mask` produces - // an AllowList in stable-id space, but `CountFromMaskExec` builds its - // fragments-allow list in row-address space. ANDing across the two - // yields a silently wrong count (rows in fragments > 0 are dropped - // because their stable ids and row addresses share a fragment-id bucket - // only by accident). Until the exec can reconcile the two id spaces, - // refuse to fire — but warn so we notice the lost optimization - // opportunity. - if filtered_read.dataset().manifest().uses_stable_row_ids() { - warn!( - "count_pushdown: skipped because the dataset uses stable row ids; \ - the count will be computed via a full scan. Reconciling the two id spaces \ - would let this query be answered from index metadata." 
- ); - return Ok(None); - } - let options = filtered_read.options(); // A refine filter is a residual the index couldn't fully evaluate — it // needs column data to apply, which we can't. @@ -668,7 +651,8 @@ mod tests { } #[tokio::test] - async fn rule_skips_with_stable_row_ids() { + async fn rule_fires_with_stable_row_ids() { + // Unfiltered count, stable row ids, with a deletion. use crate::dataset::WriteParams; let tmp = TempStrDir::default(); let mut dataset = gen_batch() @@ -692,8 +676,58 @@ mod tests { let (plan, count) = run_count(&mut scanner).await; assert_eq!(count, 19); assert!( - !plan_contains_pushdown(&plan), - "rule must not fire under stable row IDs, got plan: {}", + plan_contains_pushdown(&plan), + "rule should fire under stable row IDs, got plan: {}", + displayable(plan.as_ref()).indent(true) + ); + } + + #[tokio::test] + async fn rule_fires_with_stable_row_ids_and_filter() { + // Indexed filter, stable row ids, deletions spread across fragments -- + // the case the pre-fix code got wrong (dropped rows in fragments > 0). + use crate::dataset::WriteParams; + let tmp = TempStrDir::default(); + let mut dataset = gen_batch() + .col("ordered", lance_datagen::array::step::()) + .into_dataset_with_params( + tmp.as_str(), + FragmentCount::from(3), + FragmentRowCount::from(10), + Some(WriteParams { + max_rows_per_file: 10, + enable_stable_row_ids: true, + ..Default::default() + }), + ) + .await + .unwrap(); + dataset + .create_index( + &["ordered"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + // Delete one row from fragment 1 and one from fragment 2. + dataset + .delete("ordered = 15 OR ordered = 25") + .await + .unwrap(); + let dataset = Arc::new(dataset); + + let mut scanner = dataset.scan(); + // Matches every row across all three fragments; with the two deletions + // the live count is 28. 
+ scanner.filter("ordered >= 0").unwrap(); + let (plan, count) = run_count(&mut scanner).await; + assert_eq!(count, 28); + assert!( + plan_contains_pushdown(&plan), + "rule should fire under stable row IDs with a filter, got plan: {}", displayable(plan.as_ref()).indent(true) ); } From b022a995dbe5e1345f587f41868006f97ec55a4b Mon Sep 17 00:00:00 2001 From: Lance Release Bot Date: Mon, 22 Jun 2026 18:17:48 +0000 Subject: [PATCH 162/177] chore: bump to 9.0.0-beta.1 based on breaking change detection --- .bumpversion.toml | 2 +- Cargo.lock | 48 +++++++++++++++++++-------------------- Cargo.toml | 44 +++++++++++++++++------------------ java/lance-jni/Cargo.lock | 41 +++++++++++++++++---------------- java/lance-jni/Cargo.toml | 2 +- java/pom.xml | 2 +- python/Cargo.lock | 40 ++++++++++++++++---------------- python/Cargo.toml | 2 +- 8 files changed, 91 insertions(+), 90 deletions(-) diff --git a/.bumpversion.toml b/.bumpversion.toml index 80668862afb..7c9e5196b3f 100644 --- a/.bumpversion.toml +++ b/.bumpversion.toml @@ -1,5 +1,5 @@ [tool.bumpversion] -current_version = "8.1.0-beta.0" +current_version = "9.0.0-beta.1" parse = "(?P<major>\\d+)\\.(?P<minor>\\d+)\\.(?P<patch>\\d+)(-(?P<prerelease>(beta|rc))\\.(?P<prerelease_num>\\d+))?" 
serialize = [ "{major}.{minor}.{patch}-{prerelease}.{prerelease_num}", diff --git a/Cargo.lock b/Cargo.lock index 11a5fb65a7b..325c8dca20c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3076,7 +3076,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4380,7 +4380,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "all_asserts", "approx", @@ -4483,7 +4483,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow-array", "arrow-buffer", @@ -4531,7 +4531,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrayref", "paste", @@ -4540,7 +4540,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow-array", "arrow-buffer", @@ -4580,7 +4580,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow", "arrow-array", @@ -4613,7 +4613,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow", "arrow-array", @@ -4632,7 +4632,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "proc-macro2", "quote", @@ -4641,7 +4641,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow-arith", "arrow-array", @@ -4686,7 +4686,7 @@ dependencies = [ [[package]] name = "lance-examples" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "all_asserts", "arrow", @@ -4712,7 +4712,7 
@@ dependencies = [ [[package]] name = "lance-file" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow-arith", "arrow-array", @@ -4751,7 +4751,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "datafusion", "geo-traits", @@ -4765,7 +4765,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "approx", "arc-swap", @@ -4842,7 +4842,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow", "arrow-arith", @@ -4890,7 +4890,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "approx", "arrow-array", @@ -4910,7 +4910,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow", "async-trait", @@ -4922,7 +4922,7 @@ dependencies = [ [[package]] name = "lance-namespace-datafusion" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow-array", "arrow-schema", @@ -4938,7 +4938,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow", "arrow-array", @@ -5002,7 +5002,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow-array", "arrow-buffer", @@ -5020,7 +5020,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow", "arrow-array", @@ -5066,7 +5066,7 @@ dependencies = [ [[package]] name = "lance-test-macros" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "proc-macro2", "quote", @@ -5075,7 +5075,7 @@ dependencies = [ [[package]] name = "lance-testing" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ 
"arrow-array", "arrow-schema", @@ -5088,7 +5088,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "icu_segmenter", "jieba-rs", @@ -5101,7 +5101,7 @@ dependencies = [ [[package]] name = "lance-tools" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "clap", "lance-core", diff --git a/Cargo.toml b/Cargo.toml index f902f10496b..508c950fc6e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ resolver = "3" [workspace.package] -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" @@ -57,27 +57,27 @@ rust-version = "1.91.0" [workspace.dependencies] arc-swap = "1.7" libc = "0.2.176" -lance = { version = "=8.1.0-beta.0", path = "./rust/lance", default-features = false } -lance-arrow = { version = "=8.1.0-beta.0", path = "./rust/lance-arrow" } -lance-core = { version = "=8.1.0-beta.0", path = "./rust/lance-core" } -lance-datafusion = { version = "=8.1.0-beta.0", path = "./rust/lance-datafusion" } -lance-datagen = { version = "=8.1.0-beta.0", path = "./rust/lance-datagen" } -lance-derive = { version = "=8.1.0-beta.0", path = "./rust/lance-derive" } -lance-encoding = { version = "=8.1.0-beta.0", path = "./rust/lance-encoding" } -lance-file = { version = "=8.1.0-beta.0", path = "./rust/lance-file" } -lance-geo = { version = "=8.1.0-beta.0", path = "./rust/lance-geo" } -lance-index = { version = "=8.1.0-beta.0", path = "./rust/lance-index" } -lance-io = { version = "=8.1.0-beta.0", path = "./rust/lance-io", default-features = false } -lance-linalg = { version = "=8.1.0-beta.0", path = "./rust/lance-linalg" } -lance-namespace = { version = "=8.1.0-beta.0", path = "./rust/lance-namespace" } -lance-namespace-impls = { version = "=8.1.0-beta.0", path = "./rust/lance-namespace-impls" } +lance = { version = "=9.0.0-beta.1", path = "./rust/lance", default-features = false } +lance-arrow = { version = "=9.0.0-beta.1", 
path = "./rust/lance-arrow" } +lance-core = { version = "=9.0.0-beta.1", path = "./rust/lance-core" } +lance-datafusion = { version = "=9.0.0-beta.1", path = "./rust/lance-datafusion" } +lance-datagen = { version = "=9.0.0-beta.1", path = "./rust/lance-datagen" } +lance-derive = { version = "=9.0.0-beta.1", path = "./rust/lance-derive" } +lance-encoding = { version = "=9.0.0-beta.1", path = "./rust/lance-encoding" } +lance-file = { version = "=9.0.0-beta.1", path = "./rust/lance-file" } +lance-geo = { version = "=9.0.0-beta.1", path = "./rust/lance-geo" } +lance-index = { version = "=9.0.0-beta.1", path = "./rust/lance-index" } +lance-io = { version = "=9.0.0-beta.1", path = "./rust/lance-io", default-features = false } +lance-linalg = { version = "=9.0.0-beta.1", path = "./rust/lance-linalg" } +lance-namespace = { version = "=9.0.0-beta.1", path = "./rust/lance-namespace" } +lance-namespace-impls = { version = "=9.0.0-beta.1", path = "./rust/lance-namespace-impls" } lance-namespace-datafusion = { version = "=7.0.0-beta.9", path = "./rust/lance-namespace-datafusion" } lance-namespace-reqwest-client = "0.8.6" -lance-select = { version = "=8.1.0-beta.0", path = "./rust/lance-select" } -lance-tokenizer = { version = "=8.1.0-beta.0", path = "./rust/lance-tokenizer" } -lance-table = { version = "=8.1.0-beta.0", path = "./rust/lance-table" } -lance-test-macros = { version = "=8.1.0-beta.0", path = "./rust/lance-test-macros" } -lance-testing = { version = "=8.1.0-beta.0", path = "./rust/lance-testing" } +lance-select = { version = "=9.0.0-beta.1", path = "./rust/lance-select" } +lance-tokenizer = { version = "=9.0.0-beta.1", path = "./rust/lance-tokenizer" } +lance-table = { version = "=9.0.0-beta.1", path = "./rust/lance-table" } +lance-test-macros = { version = "=9.0.0-beta.1", path = "./rust/lance-test-macros" } +lance-testing = { version = "=9.0.0-beta.1", path = "./rust/lance-testing" } approx = "0.5.1" # Note that this one does not include pyarrow arrow = { version 
= "58.0.0", optional = false, features = ["prettyprint"] } @@ -104,7 +104,7 @@ half = { "version" = "2.1", default-features = false, features = [ "num-traits", "std", ] } -lance-bitpacking = { version = "=8.1.0-beta.0", path = "./rust/compression/bitpacking" } +lance-bitpacking = { version = "=9.0.0-beta.1", path = "./rust/compression/bitpacking" } bitpacking = "0.9" bitvec = "1" bytes = "1.11.1" @@ -143,7 +143,7 @@ datafusion-substrait = { version = "53.0.0", default-features = false } dirs = "6.0.0" either = "1.0" fst = { version = "0.4.7", features = ["levenshtein"] } -fsst = { version = "=8.1.0-beta.0", path = "./rust/compression/fsst" } +fsst = { version = "=9.0.0-beta.1", path = "./rust/compression/fsst" } futures = "0.3" geoarrow-array = "0.8" geoarrow-schema = "0.8" diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index ee52544ba57..2827ef9b19a 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -2479,7 +2479,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow-array", "rand 0.9.4", @@ -3665,7 +3665,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arc-swap", "arrow", @@ -3738,7 +3738,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow-array", "arrow-buffer", @@ -3780,7 +3780,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrayref", "paste", @@ -3789,7 +3789,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow-array", "arrow-buffer", @@ -3827,7 +3827,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = 
"8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow", "arrow-array", @@ -3859,7 +3859,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow", "arrow-array", @@ -3876,7 +3876,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "proc-macro2", "quote", @@ -3885,7 +3885,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow-arith", "arrow-array", @@ -3920,7 +3920,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow-arith", "arrow-array", @@ -3950,7 +3950,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "datafusion", "geo-traits", @@ -3964,7 +3964,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arc-swap", "arrow", @@ -4032,7 +4032,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow", "arrow-arith", @@ -4073,7 +4073,7 @@ dependencies = [ [[package]] name = "lance-jni" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow", "arrow-array", @@ -4109,7 +4109,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow-array", "arrow-buffer", @@ -4120,11 +4120,12 @@ dependencies = [ "lance-core", "num-traits", "rand 0.9.4", + "rayon", ] [[package]] name = "lance-namespace" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow", "async-trait", @@ -4136,7 +4137,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow", "arrow-ipc", @@ -4185,7 
+4186,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow-array", "arrow-buffer", @@ -4200,7 +4201,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow", "arrow-array", @@ -4237,7 +4238,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "icu_segmenter", "rust-stemmers", diff --git a/java/lance-jni/Cargo.toml b/java/lance-jni/Cargo.toml index 6210c5daf1d..db37d1d31a7 100644 --- a/java/lance-jni/Cargo.toml +++ b/java/lance-jni/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lance-jni" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" edition = "2024" authors = ["Lance Devs "] rust-version = "1.91" diff --git a/java/pom.xml b/java/pom.xml index 6306ecc63f9..9b4e57e4724 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -7,7 +7,7 @@ org.lance lance-core Lance Core - 8.1.0-beta.0 + 9.0.0-beta.1 jar Lance Format Java API diff --git a/python/Cargo.lock b/python/Cargo.lock index 126714795cc..b1b207e83d7 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -2859,7 +2859,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4067,7 +4067,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arc-swap", "arrow", @@ -4141,7 +4141,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow-array", "arrow-buffer", @@ -4183,7 +4183,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrayref", "paste", @@ -4192,7 +4192,7 @@ 
dependencies = [ [[package]] name = "lance-core" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow-array", "arrow-buffer", @@ -4230,7 +4230,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow", "arrow-array", @@ -4262,7 +4262,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow", "arrow-array", @@ -4279,7 +4279,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "proc-macro2", "quote", @@ -4288,7 +4288,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow-arith", "arrow-array", @@ -4323,7 +4323,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow-arith", "arrow-array", @@ -4353,7 +4353,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "datafusion", "geo-traits", @@ -4367,7 +4367,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arc-swap", "arrow", @@ -4436,7 +4436,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow", "arrow-arith", @@ -4477,7 +4477,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow-array", "arrow-buffer", @@ -4493,7 +4493,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow", "async-trait", @@ -4505,7 +4505,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow", 
"arrow-ipc", @@ -4554,7 +4554,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow-array", "arrow-buffer", @@ -4569,7 +4569,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "arrow", "arrow-array", @@ -4608,7 +4608,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "icu_segmenter", "jieba-rs", @@ -6046,7 +6046,7 @@ dependencies = [ [[package]] name = "pylance" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" dependencies = [ "alloc-stdlib", "arrow", diff --git a/python/Cargo.toml b/python/Cargo.toml index 240c046e5ff..89a50a652dc 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pylance" -version = "8.1.0-beta.0" +version = "9.0.0-beta.1" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" From 1c3d2017f0f98df0c2bb15fbc3e54e83f6ba2c23 Mon Sep 17 00:00:00 2001 From: Rainie Li Date: Mon, 22 Jun 2026 12:15:55 -0700 Subject: [PATCH 163/177] fix: support manifests >5 GB via size-aware copy (#7047) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes #7197. ## Summary - `object_store.copy()` is called unconditionally on the staging→final manifest copy path. This routes to S3's `CopyObject` API, which has a ~5 GB hard cap. Manifests above this fail with `EntityTooLarge` — production case was a ~14 GB manifest with `ProposedSize 14961429442` on `_versions/...manifest`. - Add `copy_size_aware`: keeps the cheap server-side `store.copy()` for sources below the limit, falls back to read+rewrite via a multipart upload for larger sources. The required `size` argument lets the caller skip an extra `head()` round-trip. 
- The 5 GiB threshold is backend-agnostic, not S3-specific: S3's `CopyObject` and GCS's single-shot `Objects.copy` both cap at ~5 GiB, so the constant is named `MAX_SERVER_SIDE_COPY_BYTES`. Stores without such a cap (e.g. local FS) take the read+rewrite fallback above 5 GiB too; correctness is preserved, only the rare large copy is slower. - Also tighten `MAX_UPLOAD_PART_SIZE` from `5 GiB` to `5 GiB - 1` so `LANCE_INITIAL_UPLOAD_SIZE=5368709120` can't trigger a single PUT of exactly 5 GiB on shutdown — which S3 also rejects. Same bug class as #6750 (multipart-aware put for txn file writes), different code path. ## Test plan New tests in `rust/lance/src/io/commit/external_manifest.rs` covering both the >5 GB read+rewrite fallback and the small-file fast path. Related downstream issue: https://github.com/lance-format/lance-spark/issues/529 Co-authored-by: Claude Opus 4.8 (1M context) --- rust/lance-io/src/object_writer.rs | 21 +- .../src/io/commit/external_manifest.rs | 141 ++++++++- rust/lance/src/io/commit/external_manifest.rs | 295 +++++++++++++++++- 3 files changed, 445 insertions(+), 12 deletions(-) diff --git a/rust/lance-io/src/object_writer.rs b/rust/lance-io/src/object_writer.rs index 4b9bb901446..0fd0a30f9e7 100644 --- a/rust/lance-io/src/object_writer.rs +++ b/rust/lance-io/src/object_writer.rs @@ -47,8 +47,12 @@ fn max_conn_reset_retries() -> u16 { }) } -/// Maximum part size in GCS and S3: 5GB. -const MAX_UPLOAD_PART_SIZE: usize = 1024 * 1024 * 1024 * 5; +/// Maximum body size for a single S3 PUT: strictly less than 5 GiB. +/// AWS rejects single-PUT bodies of exactly 5 GiB (= 5 * 1024^3) with +/// `EntityTooLarge`, so we clamp `LANCE_INITIAL_UPLOAD_SIZE` one byte +/// below that threshold to keep the buffer-fills-to-clamp single-PUT +/// path safe. See lance#6750 for the related txn-file write fix. +const MAX_UPLOAD_PART_SIZE: usize = 1024 * 1024 * 1024 * 5 - 1; /// Clamps a requested upload part size to the valid [5MB, 5GB] range. 
/// Returns the clamped value and whether clamping was necessary. @@ -898,4 +902,17 @@ mod tests { (MAX_UPLOAD_PART_SIZE, true) ); } + + /// Regression for the foot-gun where `LANCE_INITIAL_UPLOAD_SIZE=5368709120` + /// (exactly 5 GiB, Pucheng's setting) caused a single-PUT of 5 GiB on + /// shutdown — which S3 rejects with `EntityTooLarge`. After tightening + /// `MAX_UPLOAD_PART_SIZE` to 5 GiB - 1, raw 5 GiB must clamp DOWN. + #[test] + fn clamp_initial_upload_size_at_5gib_clamps_down() { + let exactly_5_gib: usize = 5 * 1024 * 1024 * 1024; + assert_eq!( + clamp_initial_upload_size(exactly_5_gib), + (MAX_UPLOAD_PART_SIZE, true) + ); + } } diff --git a/rust/lance-table/src/io/commit/external_manifest.rs b/rust/lance-table/src/io/commit/external_manifest.rs index a6c9bbaa90d..22ebaa10b4a 100644 --- a/rust/lance-table/src/io/commit/external_manifest.rs +++ b/rust/lance-table/src/io/commit/external_manifest.rs @@ -8,6 +8,8 @@ use std::sync::Arc; use async_trait::async_trait; +use bytes::Bytes; +use futures::StreamExt; use lance_core::utils::tracing::{ AUDIT_MODE_CREATE, AUDIT_MODE_DELETE, AUDIT_TYPE_MANIFEST, TRACE_FILE_AUDIT, }; @@ -123,7 +125,7 @@ pub trait ExternalManifestStore: std::fmt::Debug + Send + Sync { // Step 2: Copy staging to final path let final_path = naming_scheme.manifest_path(base_path, version); - let copied = match object_store.copy(staging_path, &final_path).await { + let copied = match copy_size_aware(object_store, staging_path, &final_path, size).await { Ok(_) => true, Err(ObjectStoreError::NotFound { .. }) => false, Err(e) => return Err(e.into()), @@ -213,6 +215,129 @@ pub(crate) fn detect_naming_scheme_from_path(path: &Path) -> Result5 GiB copy is slower than a native copy would be. +const MAX_SERVER_SIDE_COPY_BYTES: u64 = 5 * 1024 * 1024 * 1024; + +/// Part size for the read+rewrite fallback. Multipart-capable stores +/// (S3, GCS) require every part except the last to be ≥5 MB and allow up to +/// 10,000 parts. 
100 MB sits comfortably inside both bounds and keeps the +/// part count low (~140 parts for a 14 GB manifest) without large per-part +/// RAM. +const COPY_REWRITE_PART_SIZE: usize = 100 * 1024 * 1024; + +/// Copy `from` to `to`, falling back to a multipart-equivalent read+rewrite +/// when the source exceeds the server-side-copy size limit +/// (`MAX_SERVER_SIDE_COPY_BYTES`). +/// +/// For sources below the limit, this is the same fast server-side +/// `store.copy()` as before. For larger sources, the source is streamed +/// through the client and re-uploaded as a multipart upload at `to`. This +/// doubles bytes-on-the-wire for the rare large case while preserving the +/// cheap fast path for the common small case. +/// +/// `size` is the known source size. It is required: the only caller already +/// has it, and the alternative (an extra `head(from)` round-trip) is work +/// the caller can avoid by passing what it already knows. +/// +/// `NotFound` errors on `from` propagate unchanged so callers can keep +/// existing `Err(NotFound { .. })` arms. +/// +/// This is a workaround for the missing `UploadPartCopy` primitive in the +/// upstream `object_store` crate. Once that lands, this helper can be +/// deleted and the call sites can go back to plain `store.copy()`. +async fn copy_size_aware( + store: &dyn OSObjectStore, + from: &Path, + to: &Path, + size: u64, +) -> std::result::Result<(), ObjectStoreError> { + if size < MAX_SERVER_SIDE_COPY_BYTES { + store.copy(from, to).await + } else { + copy_via_read_rewrite(store, from, to).await + } +} + +// NOTE: parts are uploaded sequentially. This could be parallelized (a +// bounded JoinSet, like lance-io/src/object_writer.rs's +// LANCE_UPLOAD_CONCURRENCY) or sidestepped entirely by switching to +// `object_store::WriteMultipart` (which also handles abort-on-drop). Left +// sequential here: this is a cold path (only >5 GiB manifests) and the +// helper is itself a stopgap until `object_store` exposes UploadPartCopy. 
+async fn copy_via_read_rewrite( + store: &dyn OSObjectStore, + from: &Path, + to: &Path, +) -> std::result::Result<(), ObjectStoreError> { + // NotFound here propagates upward unchanged. + let mut stream = store.get(from).await?.into_stream(); + + // From here on, errors must `abort()` the upload to avoid leaving an + // orphan multipart upload on stores that support them (e.g. S3, GCS), + // which would otherwise incur storage charges until the bucket's + // lifecycle policy cleans it up. + // + // Note: this does NOT cover task cancellation — `MultipartUpload`'s + // upstream Drop is documented as a no-op for S3/GCS. Callers that + // need cancellation cleanliness should run this with a guard or + // switch to `object_store::WriteMultipart` (planned follow-up). + let mut upload = store.put_multipart(to).await?; + let mut part_buf: Vec = Vec::with_capacity(COPY_REWRITE_PART_SIZE); + + while let Some(chunk) = stream.next().await { + let chunk = match chunk { + Ok(b) => b, + Err(e) => { + let _ = upload.abort().await; + return Err(e); + } + }; + // Append the chunk in COPY_REWRITE_PART_SIZE-bounded slices so a + // single oversized chunk (e.g., LocalFileSystem returning a whole + // file) cannot push part_buf past the backend's per-part size limit + // (5 GiB on S3/GCS). COPY_REWRITE_PART_SIZE is well under every + // backend's cap, so each flushed part is always valid. + let mut offset = 0; + while offset < chunk.len() { + let want = COPY_REWRITE_PART_SIZE - part_buf.len(); + let take = want.min(chunk.len() - offset); + part_buf.extend_from_slice(&chunk[offset..offset + take]); + offset += take; + + if part_buf.len() >= COPY_REWRITE_PART_SIZE { + let payload = + std::mem::replace(&mut part_buf, Vec::with_capacity(COPY_REWRITE_PART_SIZE)); + if let Err(e) = upload.put_part(Bytes::from(payload).into()).await { + let _ = upload.abort().await; + return Err(e); + } + } + } + } + + // Flush the final (possibly-short) part. 
The last part of a multipart + // upload is exempt from the per-part minimum on S3/GCS. + if !part_buf.is_empty() + && let Err(e) = upload.put_part(Bytes::from(part_buf).into()).await + { + let _ = upload.abort().await; + return Err(e); + } + + if let Err(e) = upload.complete().await { + let _ = upload.abort().await; + return Err(e); + } + Ok(()) +} + /// External manifest commit handler /// This handler is used to commit a manifest to an external store /// for detailed design, see @@ -245,14 +370,12 @@ impl ExternalManifestCommitHandler { // step 1: copy the manifest to the final location let final_manifest_path = naming_scheme.manifest_path(base_path, version); - let copied = match store - .copy(staging_manifest_path, &final_manifest_path) - .await - { - Ok(_) => true, - Err(ObjectStoreError::NotFound { .. }) => false, // Another writer beat us to it. - Err(e) => return Err(e.into()), - }; + let copied = + match copy_size_aware(store, staging_manifest_path, &final_manifest_path, size).await { + Ok(_) => true, + Err(ObjectStoreError::NotFound { .. }) => false, // Another writer beat us to it. + Err(e) => return Err(e.into()), + }; if copied { info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_CREATE, r#type=AUDIT_TYPE_MANIFEST, path = final_manifest_path.as_ref()); } diff --git a/rust/lance/src/io/commit/external_manifest.rs b/rust/lance/src/io/commit/external_manifest.rs index eee4fbf07b6..850d10f9a23 100644 --- a/rust/lance/src/io/commit/external_manifest.rs +++ b/rust/lance/src/io/commit/external_manifest.rs @@ -4,10 +4,14 @@ /// Keep the tests in `lance` crate because it has dependency on [Dataset]. 
#[cfg(test)] mod test { + use std::ops::Range; use std::sync::Arc; + use std::sync::atomic::{AtomicUsize, Ordering}; use std::{collections::HashMap, time::Duration}; use async_trait::async_trait; + use bytes::Bytes; + use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt, future::join_all}; use lance_core::{Error, Result}; use lance_table::io::commit::external_manifest::{ @@ -15,7 +19,12 @@ mod test { }; use lance_table::io::commit::{CommitHandler, ManifestNamingScheme}; use lance_testing::datagen::{BatchGenerator, IncrementingInt32}; - use object_store::{ObjectStoreExt, local::LocalFileSystem, path::Path}; + use object_store::memory::InMemory; + use object_store::{ + CopyOptions, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, + ObjectStore as OSObjectStore, ObjectStoreExt, PutMultipartOptions, PutOptions, PutPayload, + PutResult, RenameOptions, Result as OSResult, local::LocalFileSystem, path::Path, + }; use tokio::sync::Mutex; use crate::dataset::builder::DatasetBuilder; @@ -420,4 +429,288 @@ mod test { .collect::>(); assert!(unexpected_entries.is_empty(), "{:?}", unexpected_entries); } + + /// S3's `CopyObject` API has a hard 5 GB cap on the source object size. + /// Above that, callers must use multipart copy (`UploadPartCopy`) instead. + /// `lance-table::io::commit::external_manifest` calls + /// `object_store.copy(staging, final)` unconditionally on the manifest + /// commit path — which fails for manifests >5 GB. + /// + /// This wrapper enforces that S3-side cap on top of any inner store, so + /// the regression can be reproduced in-process without S3. + /// + /// It also lets the test override `head().size` for a chosen path, so the + /// staging file can *appear* to be 14 GB without actually putting that + /// many bytes into the inner store. 
+ const S3_COPY_OBJECT_CAP_BYTES: u64 = 5 * 1024 * 1024 * 1024; + + #[derive(Debug)] + struct CopyCapStore { + inner: Arc, + /// path → fake size returned by head(); overrides the inner store. + head_size_overrides: Arc>>, + /// Counts calls to `copy_opts` (the fast path). Tests use this to + /// assert which branch of `copy_size_aware` was taken — succeeding + /// alone is not enough, since the slow path can also succeed for + /// small files. + copy_calls: AtomicUsize, + /// Counts calls to `put_multipart_opts` (the slow read+rewrite path). + put_multipart_calls: AtomicUsize, + } + + impl CopyCapStore { + fn new(inner: Arc) -> Self { + Self { + inner, + head_size_overrides: Arc::new(Mutex::new(HashMap::new())), + copy_calls: AtomicUsize::new(0), + put_multipart_calls: AtomicUsize::new(0), + } + } + + async fn override_size(&self, path: &Path, size: u64) { + self.head_size_overrides + .lock() + .await + .insert(path.to_string(), size); + } + + async fn effective_size(&self, location: &Path, real: u64) -> u64 { + self.head_size_overrides + .lock() + .await + .get(&location.to_string()) + .copied() + .unwrap_or(real) + } + + fn copy_calls(&self) -> usize { + self.copy_calls.load(Ordering::SeqCst) + } + + fn put_multipart_calls(&self) -> usize { + self.put_multipart_calls.load(Ordering::SeqCst) + } + } + + impl std::fmt::Display for CopyCapStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "CopyCapStore({})", self.inner) + } + } + + #[async_trait] + impl OSObjectStore for CopyCapStore { + async fn put_opts( + &self, + location: &Path, + bytes: PutPayload, + opts: PutOptions, + ) -> OSResult { + self.inner.put_opts(location, bytes, opts).await + } + + async fn put_multipart_opts( + &self, + location: &Path, + opts: PutMultipartOptions, + ) -> OSResult> { + self.put_multipart_calls.fetch_add(1, Ordering::SeqCst); + self.inner.put_multipart_opts(location, opts).await + } + + async fn get_opts(&self, location: &Path, options: 
GetOptions) -> OSResult { + // `head()` is a default method on `ObjectStore` that delegates to + // `get_opts(location, GetOptions { head: true, .. })`. To make a + // staging file *appear* to be 14 GB without holding 14 GB in + // memory, we override the size in the returned ObjectMeta here. + let mut res = self.inner.get_opts(location, options).await?; + let overridden = self.effective_size(location, res.meta.size).await; + res.meta.size = overridden; + Ok(res) + } + + async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> OSResult> { + self.inner.get_ranges(location, ranges).await + } + + // `head` and `delete` are default methods on `ObjectStore`, derived + // from `get_opts`/`delete_stream`. We override `head` indirectly by + // overriding `get_opts` below — it returns size based on the + // overrides table for the chosen path. + fn delete_stream( + &self, + locations: BoxStream<'static, OSResult>, + ) -> BoxStream<'static, OSResult> { + self.inner.delete_stream(locations) + } + + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, OSResult> { + self.inner.list(prefix) + } + + fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> BoxStream<'static, OSResult> { + self.inner.list_with_offset(prefix, offset) + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> OSResult { + self.inner.list_with_delimiter(prefix).await + } + + async fn copy_opts(&self, from: &Path, to: &Path, opts: CopyOptions) -> OSResult<()> { + // Mimic S3's CopyObject 5 GB hard cap: read the (possibly-overridden) + // size of the source via head() and reject if it crosses the cap. 
+ let meta = self.head(from).await?; + if meta.size >= S3_COPY_OBJECT_CAP_BYTES { + return Err(object_store::Error::Generic { + store: "S3", + source: format!( + "EntityTooLarge: ProposedSize {} exceeds CopyObject 5GB cap", + meta.size + ) + .into(), + }); + } + self.copy_calls.fetch_add(1, Ordering::SeqCst); + self.inner.copy_opts(from, to, opts).await + } + + async fn rename_opts(&self, from: &Path, to: &Path, opts: RenameOptions) -> OSResult<()> { + self.inner.rename_opts(from, to, opts).await + } + } + + /// Repro for the manifest >5 GB bug. + /// + /// Drives `ExternalManifestStore::put` (the default impl) against a + /// staging file whose `head().size` is reported as 14 GB. That `put` + /// calls `object_store.copy(staging, final)` unconditionally — which + /// our `CopyCapStore` wrapper rejects with the same `EntityTooLarge` + /// error S3 returns in production. + /// + /// Today this test is RED: the copy step fails on >5 GB. + /// After `copy_size_aware` lands, it should turn GREEN by falling back + /// to a multipart-equivalent path (option 1: read+rewrite via + /// `ObjectWriter`). + #[tokio::test] + async fn manifest_commit_succeeds_when_staging_exceeds_5gb_copy_cap() { + let inner: Arc = Arc::new(InMemory::new()); + let capped = Arc::new(CopyCapStore::new(inner)); + + // Write a small staging file, then lie about its size so the + // CopyObject cap fires without holding 14 GB in memory. + let base_path = Path::from("repro"); + let staging_path = Path::from("repro/_versions/1.manifest.staging-abcd"); + let body = b"fake manifest body"; + capped + .put(&staging_path, PutPayload::from_static(body)) + .await + .expect("seed staging file"); + capped + .override_size(&staging_path, 14_961_429_442) // matches the production failure + .await; + + // Spin up an ExternalManifestStore and drive `put` (the same code + // path the failing CTAS hits via ExternalManifestCommitHandler). 
+ let external = SleepyExternalManifestStore::new(); + let head_meta = capped.head(&staging_path).await.unwrap(); + + let location = external + .put( + &base_path, + 1, + &staging_path, + head_meta.size, + head_meta.e_tag, + capped.as_ref(), + ManifestNamingScheme::V2, + ) + .await + .expect( + "manifest commit should succeed for a >5 GB staging file via multipart-aware copy", + ); + + // Branch-taken assertions: the slow read+rewrite path was used. + assert_eq!( + capped.copy_calls(), + 0, + "CopyObject must not be attempted for >5 GiB sources" + ); + assert!( + capped.put_multipart_calls() >= 1, + "read+rewrite path must initiate a multipart upload" + ); + + // End-state assertions: final manifest exists with the original + // bytes, and the staging file was deleted. + let final_get = capped + .inner + .get(&location.path) + .await + .expect("final manifest must exist on the inner store") + .bytes() + .await + .unwrap(); + assert_eq!(final_get.as_ref(), body); + let staging_after = capped.inner.head(&staging_path).await; + assert!( + matches!(staging_after, Err(object_store::Error::NotFound { .. })), + "staging file must be cleaned up after commit, got: {:?}", + staging_after + ); + } + + /// Counterpart to manifest_commit_succeeds_when_staging_exceeds_5gb_copy_cap. + /// Confirms that for staging files BELOW the 5 GB cap, the fast-path + /// server-side `copy()` is still used — i.e. we haven't accidentally + /// regressed every commit to read+rewrite. 
+ #[tokio::test] + async fn manifest_commit_uses_fast_copy_for_small_staging() { + let inner: Arc = Arc::new(InMemory::new()); + let capped = Arc::new(CopyCapStore::new(inner)); + + let base_path = Path::from("repro"); + let staging_path = Path::from("repro/_versions/1.manifest.staging-abcd"); + capped + .put( + &staging_path, + PutPayload::from_static(b"small manifest body"), + ) + .await + .expect("seed staging file"); + // No size override — the staging file's real size is ~20 bytes, + // well below the 5 GB cap, so copy_size_aware must take the fast + // path. + + let external = SleepyExternalManifestStore::new(); + let head_meta = capped.head(&staging_path).await.unwrap(); + + external + .put( + &base_path, + 1, + &staging_path, + head_meta.size, + head_meta.e_tag, + capped.as_ref(), + ManifestNamingScheme::V2, + ) + .await + .expect("small manifest commit must succeed via fast-path copy"); + + // The branch-taken assertion: fast path was used, slow path was not. + assert!( + capped.copy_calls() >= 1, + "small-file commit must use server-side CopyObject" + ); + assert_eq!( + capped.put_multipart_calls(), + 0, + "small-file commit must NOT initiate a multipart upload" + ); + } } From f4d24ca30b34a4fb206736cb51634a94ec3cd294 Mon Sep 17 00:00:00 2001 From: Wyatt Alt Date: Mon, 22 Jun 2026 12:15:58 -0700 Subject: [PATCH 164/177] fix: reject DataReplacement racing concurrent Update/Delete/Merge (#7373) A DataReplacement rewrites a column's data file positionally against the fragments it targets. The conflict resolver returned Ok unconditionally for a concurrent Update, Delete, or Merge, so a DataReplacement committed at a read version those operations had superseded was applied silently -- dropping or misaligning the rows the concurrent op moved or deleted, with no error raised. Merge was additionally asymmetric: check_merge_txn already treats a concurrent DataReplacement as a conflict, but not the reverse. 
For Update/Delete, conflict when the other transaction's updated/removed fragment ids overlap our replacement fragment ids (mirrors the existing Rewrite handling). For Merge, which rewrites the entire fragment list, conflict unconditionally (mirrors check_merge_txn). All are retryable, so the committer rebuilds against the new layout. Adds DataReplacement vs Update, Delete, and Merge cases (same and different fragment) to test_conflicts_data_replacement. --------- Co-authored-by: Claude Opus 4.8 (1M context) --- docs/src/format/table/transaction.md | 1 + .../src/dataset/tests/dataset_merge_update.rs | 3 +- rust/lance/src/io/commit/conflict_resolver.rs | 111 +++++++++++++++++- 3 files changed, 110 insertions(+), 5 deletions(-) diff --git a/docs/src/format/table/transaction.md b/docs/src/format/table/transaction.md index d1a5191bf54..436d857c36a 100644 --- a/docs/src/format/table/transaction.md +++ b/docs/src/format/table/transaction.md @@ -457,6 +457,7 @@ The following operations are retryable conflicts with DataReplacement: - CreateIndex (only if the field being replaced is being indexed) - Rewrite (only if overlapping fragments) - Update (only if overlapping fragments) +- Delete (only if overlapping fragments) - Merge (always) ### UpdateMemWalState diff --git a/rust/lance/src/dataset/tests/dataset_merge_update.rs b/rust/lance/src/dataset/tests/dataset_merge_update.rs index 7fa03d6e78d..c6f448040c2 100644 --- a/rust/lance/src/dataset/tests/dataset_merge_update.rs +++ b/rust/lance/src/dataset/tests/dataset_merge_update.rs @@ -1047,7 +1047,8 @@ async fn test_datafile_replacement_error() { Operation::DataReplacement { replacements: vec![DataReplacementGroup(0, new_data_file)], }, - Some(2), + // read at the current version (after the Merge above) + Some(dataset.manifest.version), None, None, Arc::new(Default::default()), diff --git a/rust/lance/src/io/commit/conflict_resolver.rs b/rust/lance/src/io/commit/conflict_resolver.rs index b242cd5b3dd..dc898534c89 100644 --- 
a/rust/lance/src/io/commit/conflict_resolver.rs +++ b/rust/lance/src/io/commit/conflict_resolver.rs @@ -904,13 +904,42 @@ impl<'a> TransactionRebase<'a> { match &other_transaction.operation { Operation::Append { .. } | Operation::Clone { .. } - | Operation::Delete { .. } - | Operation::Update { .. } - | Operation::Merge { .. } | Operation::UpdateConfig { .. } | Operation::ReserveFragments { .. } | Operation::Project { .. } | Operation::UpdateBases { .. } => Ok(()), + Operation::Merge { .. } => { + // Merge rewrites the whole fragment list; always conflict + // (symmetric with check_merge_txn). + Err(self.retryable_conflict_err(other_transaction, other_version)) + } + Operation::Update { + updated_fragments, + removed_fragment_ids, + .. + } + | Operation::Delete { + updated_fragments, + deleted_fragment_ids: removed_fragment_ids, + .. + } => { + // A concurrent Update/Delete that changed one of our target + // fragments makes our positional column file stale; conflict so + // the committer rebuilds (lance otherwise accepts it silently). + for replacement in replacements { + let touches_our_fragment = updated_fragments + .iter() + .map(|f| f.id) + .chain(removed_fragment_ids.iter().copied()) + .any(|id| id == replacement.0); + if touches_our_fragment { + return Err( + self.retryable_conflict_err(other_transaction, other_version) + ); + } + } + Ok(()) + } Operation::CreateIndex { new_indices, .. } => { // A data replacement only conflicts if it is updating the field that // is being indexed. 
@@ -3258,7 +3287,7 @@ mod tests { ( "DataReplacement vs Rewrite on different fragment", Operation::DataReplacement { - replacements: vec![DataReplacementGroup(0, data_file_frag0_fields01)], + replacements: vec![DataReplacementGroup(0, data_file_frag0_fields01.clone())], }, Operation::Rewrite { groups: vec![RewriteGroup { @@ -3270,6 +3299,80 @@ mod tests { }, Compatible, ), + // A concurrent Update/Delete on a fragment we replace a column in must + // conflict, else the stale positional file is applied silently. + ( + "DataReplacement vs Update on same fragment", + Operation::DataReplacement { + replacements: vec![DataReplacementGroup(0, data_file_frag0_fields01.clone())], + }, + Operation::Update { + updated_fragments: vec![Fragment::new(0)], + removed_fragment_ids: vec![], + new_fragments: vec![], + fields_modified: vec![], + merged_generations: Vec::new(), + fields_for_preserving_frag_bitmap: vec![], + update_mode: None, + inserted_rows_filter: None, + updated_fragment_offsets: None, + }, + Retryable, + ), + ( + "DataReplacement vs Update on different fragment", + Operation::DataReplacement { + replacements: vec![DataReplacementGroup(0, data_file_frag0_fields01.clone())], + }, + Operation::Update { + updated_fragments: vec![Fragment::new(1)], + removed_fragment_ids: vec![], + new_fragments: vec![], + fields_modified: vec![], + merged_generations: Vec::new(), + fields_for_preserving_frag_bitmap: vec![], + update_mode: None, + inserted_rows_filter: None, + updated_fragment_offsets: None, + }, + Compatible, + ), + ( + "DataReplacement vs Delete on same fragment", + Operation::DataReplacement { + replacements: vec![DataReplacementGroup(0, data_file_frag0_fields01.clone())], + }, + Operation::Delete { + deleted_fragment_ids: vec![], + updated_fragments: vec![Fragment::new(0)], + predicate: "a > 0".to_string(), + }, + Retryable, + ), + ( + "DataReplacement vs Delete that removes the fragment", + Operation::DataReplacement { + replacements: vec![DataReplacementGroup(0, 
data_file_frag0_fields01.clone())], + }, + Operation::Delete { + deleted_fragment_ids: vec![0], + updated_fragments: vec![], + predicate: "a > 0".to_string(), + }, + Retryable, + ), + // Merge rewrites the whole fragment list -> always conflicts. + ( + "DataReplacement vs Merge", + Operation::DataReplacement { + replacements: vec![DataReplacementGroup(0, data_file_frag0_fields01)], + }, + Operation::Merge { + fragments: vec![Fragment::new(0)], + schema: lance_core::datatypes::Schema::default(), + }, + Retryable, + ), ]; for (description, op1, op2, expected) in cases { From 9f26271d0b9b02afa99861e1fea2009cdd74c75a Mon Sep 17 00:00:00 2001 From: Yang Cen Date: Tue, 23 Jun 2026 10:05:46 +0800 Subject: [PATCH 165/177] perf(fts): prune low-scoring conjunction candidates (#7386) ## Performance Improvement ### What is the performance issue or bottleneck? For FTS conjunction searches, once the top-k threshold is established, the AND path can still fully validate and score aligned candidate documents even when a cheap upper bound proves they cannot enter the heap. That pays for full BM25 scoring, phrase checks, and frequency collection for candidates that are already below the competitive threshold. ### How does this PR improve performance? This adds an AND-only score-first candidate prune in `Wand::search`. After all conjunction postings are aligned, the scorer first computes the exact contribution of one lead posting, then adds the remaining postings' current block-max scores as a safe upper bound. If that upper bound cannot beat the threshold, the candidate is skipped before phrase validation, full scoring, and term-frequency collection. The change is intentionally narrow: - OR and flat-search paths are unchanged. - Missing-term and fuzzy AND semantics are unchanged. - The bound uses existing block-max scores, so exact top-k behavior is preserved for `wand_factor == 1.0`. - Phrase queries still use the prune only when the BM25 upper bound is already non-competitive. 
### Benchmark or measurement results No end-to-end benchmark was run for this draft. The new regression coverage includes a counting scorer case that verifies low-scoring AND candidates avoid full scoring, plus a top-k correctness case that keeps a later high-scoring candidate. ## Validation - `cargo fmt --all --check` - `git diff --check` - `CARGO_TARGET_DIR=/tmp/lance-target-fts-and-prune-main cargo test -p lance-index scalar::inverted::wand::tests -- --nocapture` - `CARGO_TARGET_DIR=/tmp/lance-target-fts-and-prune-clippy cargo clippy --all --tests --benches -- -D warnings` --- rust/lance-index/src/metrics.rs | 19 ++ rust/lance-index/src/scalar/inverted/wand.rs | 281 ++++++++++++++++++- rust/lance/src/io/exec/fts.rs | 30 +- 3 files changed, 327 insertions(+), 3 deletions(-) diff --git a/rust/lance-index/src/metrics.rs b/rust/lance-index/src/metrics.rs index 37e2c43d196..8c0c119a3c3 100644 --- a/rust/lance-index/src/metrics.rs +++ b/rust/lance-index/src/metrics.rs @@ -3,6 +3,11 @@ use std::sync::atomic::{AtomicUsize, Ordering}; +pub const AND_CANDIDATES_SEEN_METRIC: &str = "and_candidates_seen"; +pub const AND_CANDIDATES_PRUNED_BEFORE_RETURN_METRIC: &str = "and_candidates_pruned_before_return"; +pub const AND_FULL_SCORES_METRIC: &str = "and_full_scores"; +pub const FREQS_COLLECTED_METRIC: &str = "freqs_collected"; + /// A trait used by the index to report metrics /// /// Callers can implement this trait to collect metrics @@ -44,6 +49,20 @@ pub trait MetricsCollector: Send + Sync { /// The goal is to provide some visibility into the compute cost of the search fn record_comparisons(&self, num_comparisons: usize); + /// Record AND candidates returned from WAND alignment to the scoring loop. + /// + /// This excludes candidates pruned before `next()` returns. Use this with + /// `record_and_candidates_pruned_before_return` to recover total aligned + /// AND candidates. 
+ fn record_and_candidates_seen(&self, _num_candidates: usize) {} + + /// Record AND candidates pruned during WAND alignment before `next()` returns. + fn record_and_candidates_pruned_before_return(&self, _num_candidates: usize) {} + + fn record_and_full_scores(&self, _num_scores: usize) {} + + fn record_freqs_collected(&self, _num_collections: usize) {} + /// Returns an optional sink for recording exact I/O statistics (bytes read, /// IOPS, and requests) performed on behalf of this collector. /// diff --git a/rust/lance-index/src/scalar/inverted/wand.rs b/rust/lance-index/src/scalar/inverted/wand.rs index d693bef7877..485aa99dced 100644 --- a/rust/lance-index/src/scalar/inverted/wand.rs +++ b/rust/lance-index/src/scalar/inverted/wand.rs @@ -588,6 +588,7 @@ pub struct Wand<'a, S: Scorer> { // Last conjunction doc returned to the caller. The next conjunction search // resumes strictly after this doc, like Lucene's `nextDoc()/advance()`. and_last_doc: Option, + and_candidates_pruned_before_return: usize, docs: &'a DocSet, scorer: S, // Shared cross-partition top-k floor. Each partition publishes its local @@ -650,6 +651,7 @@ impl<'a, S: Scorer> Wand<'a, S> { up_to: None, and_max_score: f32::INFINITY, and_last_doc: None, + and_candidates_pruned_before_return: 0, docs, scorer, shared_threshold: None, @@ -719,12 +721,19 @@ impl<'a, S: Scorer> Wand<'a, S> { let mut candidates = BinaryHeap::with_capacity(std::cmp::min(limit, BLOCK_SIZE * 10)); let mut num_comparisons = 0; + let mut and_candidates_seen = 0; + let mut and_full_scores = 0; + let mut freqs_collected = 0; + let pruned_before_return_start = self.and_candidates_pruned_before_return; loop { self.raise_to_shared_floor(params.wand_factor); let Some((doc, mut score)) = self.next()? 
else { break; }; num_comparisons += 1; + if self.operator == Operator::And { + and_candidates_seen += 1; + } // Either a real row_id (so we can run the mask check // inline) or the doc_id widened to u64 (deferred path; @@ -777,11 +786,15 @@ impl<'a, S: Scorer> Wand<'a, S> { { continue; } + and_full_scores += 1; self.score(doc_length) }; if candidates.len() < limit { let freqs = self.iter_term_freqs().collect(); + if self.operator == Operator::And { + freqs_collected += 1; + } candidates.push(Reverse(( ScoredDoc::new(row_id, score), freqs, @@ -794,6 +807,9 @@ impl<'a, S: Scorer> Wand<'a, S> { } } else if score > candidates.peek().unwrap().0.0.score.0 { let freqs = self.iter_term_freqs().collect(); + if self.operator == Operator::And { + freqs_collected += 1; + } candidates.pop(); candidates.push(Reverse(( ScoredDoc::new(row_id, score), @@ -809,6 +825,13 @@ impl<'a, S: Scorer> Wand<'a, S> { } } metrics.record_comparisons(num_comparisons); + let and_candidates_pruned_before_return = self + .and_candidates_pruned_before_return + .saturating_sub(pruned_before_return_start); + metrics.record_and_candidates_seen(and_candidates_seen); + metrics.record_and_candidates_pruned_before_return(and_candidates_pruned_before_return); + metrics.record_and_full_scores(and_full_scores); + metrics.record_freqs_collected(freqs_collected); // The heap entry's `row_id` slot is either a real row_id // (DocSet had row_ids) or the doc_id widened to u64 @@ -982,6 +1005,30 @@ impl<'a, S: Scorer> Wand<'a, S> { }) } + fn and_candidate_cannot_beat_threshold(&self, doc_length: u32) -> bool { + if self.operator != Operator::And + || self.threshold <= 0.0 + || self.num_terms < 2 + || self.lead.len() != self.num_terms + { + return false; + } + + let Some((first, remaining)) = self.lead.split_first() else { + return false; + }; + let Some(doc) = first.doc() else { + return false; + }; + + let remaining_upper_bound = remaining + .iter() + .map(|posting| posting.block_max_score()) + .sum::(); + 
first.score(&self.scorer, doc.frequency(), doc_length) + remaining_upper_bound + <= self.threshold + } + // find the next doc candidate // Find the next term-level candidate doc. The returned score is the exact // contribution from the current `lead` set; additional score can still come @@ -1100,8 +1147,23 @@ impl<'a, S: Scorer> Wand<'a, S> { } } + let lead_doc = self.lead.first().and_then(|posting| posting.doc())?; + let doc_length = match &lead_doc { + DocInfo::Raw(doc) => self.docs.num_tokens(doc.doc_id), + DocInfo::Located(doc) => self.docs.num_tokens_by_row_id(doc.row_id), + }; + if self.and_candidate_cannot_beat_threshold(doc_length) { + self.and_candidates_pruned_before_return += 1; + let next_target = self.and_advance_target(doc.saturating_add(1)); + if next_target == TERMINATED_DOC_ID { + return None; + } + self.lead[0].next(next_target); + continue; + } + self.and_last_doc = Some(doc); - return self.lead.first().and_then(|posting| posting.doc()); + return Some(lead_doc); } } @@ -1661,7 +1723,7 @@ mod tests { use super::*; use crate::scalar::inverted::scorer::IndexBM25Scorer; use crate::{ - metrics::NoOpMetricsCollector, + metrics::{MetricsCollector, NoOpMetricsCollector}, scalar::inverted::{ CompressedPostingList, PlainPostingList, PostingListBuilder, builder::PositionRecorder, encoding::compress_posting_list, @@ -1721,6 +1783,42 @@ mod tests { } } + #[derive(Default)] + struct CountAndSearchStats { + comparisons: AtomicUsize, + candidates_seen: AtomicUsize, + candidates_pruned_before_return: AtomicUsize, + full_scores: AtomicUsize, + freqs_collected: AtomicUsize, + } + + impl MetricsCollector for CountAndSearchStats { + fn record_parts_loaded(&self, _: usize) {} + + fn record_index_loads(&self, _: usize) {} + + fn record_comparisons(&self, n: usize) { + self.comparisons.fetch_add(n, Ordering::Relaxed); + } + + fn record_and_candidates_seen(&self, n: usize) { + self.candidates_seen.fetch_add(n, Ordering::Relaxed); + } + + fn 
record_and_candidates_pruned_before_return(&self, n: usize) { + self.candidates_pruned_before_return + .fetch_add(n, Ordering::Relaxed); + } + + fn record_and_full_scores(&self, n: usize) { + self.full_scores.fetch_add(n, Ordering::Relaxed); + } + + fn record_freqs_collected(&self, n: usize) { + self.freqs_collected.fetch_add(n, Ordering::Relaxed); + } + } + fn generate_posting_list( doc_ids: Vec, max_score: f32, @@ -1728,6 +1826,17 @@ mod tests { is_compressed: bool, ) -> PostingList { let freqs = vec![1; doc_ids.len()]; + generate_posting_list_with_freqs(doc_ids, freqs, max_score, block_max_scores, is_compressed) + } + + fn generate_posting_list_with_freqs( + doc_ids: Vec, + freqs: Vec, + max_score: f32, + block_max_scores: Option>, + is_compressed: bool, + ) -> PostingList { + assert_eq!(doc_ids.len(), freqs.len()); let block_max_scores = block_max_scores.unwrap_or_else(|| vec![max_score; doc_ids.len()]); if is_compressed { let blocks = compress_posting_list( @@ -2160,6 +2269,174 @@ mod tests { assert_eq!(candidate.0.doc_id(), BLOCK_SIZE as u64); } + #[test] + fn test_and_candidate_prune_scores_first_term_before_full_score() { + let total_docs = 2 * BLOCK_SIZE as u32 + 1; + let mut docs = DocSet::default(); + for doc_id in 0..total_docs { + let doc_tokens = if doc_id == 0 { 1 } else { 1000 }; + docs.append(doc_id as u64, doc_tokens); + } + + let first_docs = (0..2 * BLOCK_SIZE as u32).collect::>(); + let second_docs = (0..total_docs).collect::>(); + let postings = vec![ + PostingIterator::with_query_weight( + String::from("a"), + 0, + 0, + 1.0, + generate_posting_list(first_docs, 1.0, Some(vec![1.0, 0.001]), true), + docs.len(), + ), + PostingIterator::with_query_weight( + String::from("b"), + 1, + 1, + 1.0, + generate_posting_list(second_docs, 1.0, Some(vec![1.0, 0.001, 0.001]), true), + docs.len(), + ), + ]; + + let scored = Arc::new(AtomicUsize::new(0)); + let mut wand = Wand::new( + Operator::And, + postings.into_iter(), + &docs, + CountingScorer { + scored: 
scored.clone(), + }, + ); + + let result = wand + .search( + &FtsSearchParams::new().with_limit(Some(1)), + Arc::new(RowAddrMask::default()), + &NoOpMetricsCollector, + ) + .unwrap(); + + let addrs = result.into_iter().map(|doc| doc.addr).collect::>(); + assert!(matches!(addrs.as_slice(), [CandidateAddr::RowId(0)])); + let scored = scored.load(Ordering::Relaxed); + assert!( + scored <= BLOCK_SIZE + 1, + "expected candidate pruning to avoid full scoring in the first block, scored {scored}" + ); + } + + #[test] + fn test_and_candidate_prune_records_scoring_counters() { + let total_docs = 2 * BLOCK_SIZE as u32 + 1; + let mut docs = DocSet::default(); + for doc_id in 0..total_docs { + let doc_tokens = if doc_id == 0 { 1 } else { 1000 }; + docs.append(doc_id as u64, doc_tokens); + } + + let first_docs = (0..2 * BLOCK_SIZE as u32).collect::>(); + let second_docs = (0..total_docs).collect::>(); + let postings = vec![ + PostingIterator::with_query_weight( + String::from("a"), + 0, + 0, + 1.0, + generate_posting_list(first_docs, 1.0, Some(vec![1.0, 0.001]), true), + docs.len(), + ), + PostingIterator::with_query_weight( + String::from("b"), + 1, + 1, + 1.0, + generate_posting_list(second_docs, 1.0, Some(vec![1.0, 0.001, 0.001]), true), + docs.len(), + ), + ]; + + let mut wand = Wand::new( + Operator::And, + postings.into_iter(), + &docs, + InverseDocLengthScorer, + ); + let metrics = CountAndSearchStats::default(); + let result = wand + .search( + &FtsSearchParams::new().with_limit(Some(1)), + Arc::new(RowAddrMask::default()), + &metrics, + ) + .unwrap(); + + let addrs = result.into_iter().map(|doc| doc.addr).collect::>(); + assert!(matches!(addrs.as_slice(), [CandidateAddr::RowId(0)])); + + let candidates_seen = metrics.candidates_seen.load(Ordering::Relaxed); + let candidates_pruned_before_return = metrics + .candidates_pruned_before_return + .load(Ordering::Relaxed); + let full_scores = metrics.full_scores.load(Ordering::Relaxed); + 
assert_eq!(metrics.comparisons.load(Ordering::Relaxed), 1); + assert_eq!(candidates_seen, 1); + assert!(candidates_pruned_before_return > 0); + assert_eq!(full_scores, 1); + assert_eq!(metrics.freqs_collected.load(Ordering::Relaxed), 1); + } + + #[test] + fn test_and_candidate_prune_keeps_later_high_score_candidate() { + let mut docs = DocSet::default(); + for doc_id in 0..3 { + docs.append(doc_id, 1); + } + + let postings = vec![ + PostingIterator::with_query_weight( + String::from("a"), + 0, + 0, + 1.0, + generate_posting_list_with_freqs( + vec![0, 1], + vec![10, 1], + 10.0, + Some(vec![10.0]), + true, + ), + docs.len(), + ), + PostingIterator::with_query_weight( + String::from("b"), + 1, + 1, + 1.0, + generate_posting_list_with_freqs( + vec![0, 1, 2], + vec![1, 20, 1], + 20.0, + Some(vec![20.0]), + true, + ), + docs.len(), + ), + ]; + + let mut wand = Wand::new(Operator::And, postings.into_iter(), &docs, UnitScorer); + let result = wand + .search( + &FtsSearchParams::new().with_limit(Some(1)), + Arc::new(RowAddrMask::default()), + &NoOpMetricsCollector, + ) + .unwrap(); + + let addrs = result.into_iter().map(|doc| doc.addr).collect::>(); + assert!(matches!(addrs.as_slice(), [CandidateAddr::RowId(1)])); + } + #[rstest] fn test_wand_batches_lagging_iterators(#[values(false, true)] is_compressed: bool) { let mut docs = DocSet::default(); diff --git a/rust/lance/src/io/exec/fts.rs b/rust/lance/src/io/exec/fts.rs index 76db9300d0f..c087d3289c0 100644 --- a/rust/lance/src/io/exec/fts.rs +++ b/rust/lance/src/io/exec/fts.rs @@ -37,7 +37,10 @@ use super::PreFilterSource; use super::utils::{IndexMetrics, build_prefilter}; use crate::index::scalar::inverted::{load_segment_details, load_segments}; use crate::{Dataset, index::DatasetIndexInternalExt}; -use lance_index::metrics::MetricsCollector; +use lance_index::metrics::{ + AND_CANDIDATES_PRUNED_BEFORE_RETURN_METRIC, AND_CANDIDATES_SEEN_METRIC, AND_FULL_SCORES_METRIC, + FREQS_COLLECTED_METRIC, MetricsCollector, +}; use 
lance_index::scalar::inverted::builder::ScoredDoc; use lance_index::scalar::inverted::builder::document_input; use lance_index::scalar::inverted::document_tokenizer::{DocType, JsonTokenizer, LanceTokenizer}; @@ -159,6 +162,10 @@ fn default_text_tokenizer() -> Box { pub struct FtsIndexMetrics { index_metrics: IndexMetrics, partitions_searched: Count, + and_candidates_seen: Count, + and_candidates_pruned_before_return: Count, + and_full_scores: Count, + freqs_collected: Count, baseline_metrics: BaselineMetrics, } @@ -167,6 +174,11 @@ impl FtsIndexMetrics { Self { index_metrics: IndexMetrics::new(metrics, partition), partitions_searched: metrics.new_count(PARTITIONS_SEARCHED_METRIC, partition), + and_candidates_seen: metrics.new_count(AND_CANDIDATES_SEEN_METRIC, partition), + and_candidates_pruned_before_return: metrics + .new_count(AND_CANDIDATES_PRUNED_BEFORE_RETURN_METRIC, partition), + and_full_scores: metrics.new_count(AND_FULL_SCORES_METRIC, partition), + freqs_collected: metrics.new_count(FREQS_COLLECTED_METRIC, partition), baseline_metrics: BaselineMetrics::new(metrics, partition), } } @@ -188,6 +200,22 @@ impl MetricsCollector for FtsIndexMetrics { fn record_comparisons(&self, num_comparisons: usize) { self.index_metrics.record_comparisons(num_comparisons); } + + fn record_and_candidates_seen(&self, num_candidates: usize) { + self.and_candidates_seen.add(num_candidates); + } + + fn record_and_candidates_pruned_before_return(&self, num_candidates: usize) { + self.and_candidates_pruned_before_return.add(num_candidates); + } + + fn record_and_full_scores(&self, num_scores: usize) { + self.and_full_scores.add(num_scores); + } + + fn record_freqs_collected(&self, num_collections: usize) { + self.freqs_collected.add(num_collections); + } } #[derive(Debug)] From b1efc9da8ead500939ae218adfd7d12b9d8aeb18 Mon Sep 17 00:00:00 2001 From: ForwardXu Date: Tue, 23 Jun 2026 10:21:37 +0800 Subject: [PATCH 166/177] feat(namespace-dir): add alter table column operations 
(add/alter/drop) (#6273) --- rust/lance-namespace-impls/src/dir.rs | 427 +++++++++++++++- .../lance-namespace-impls/src/dir/manifest.rs | 465 +++++++++++++++++- 2 files changed, 890 insertions(+), 2 deletions(-) diff --git a/rust/lance-namespace-impls/src/dir.rs b/rust/lance-namespace-impls/src/dir.rs index e97c5c836b7..681cfa430f2 100644 --- a/rust/lance-namespace-impls/src/dir.rs +++ b/rust/lance-namespace-impls/src/dir.rs @@ -46,6 +46,8 @@ use std::sync::{Arc, Mutex}; use crate::context::DynamicContextProvider; use lance_namespace::models::{ + AlterTableAddColumnsRequest, AlterTableAddColumnsResponse, AlterTableAlterColumnsRequest, + AlterTableAlterColumnsResponse, AlterTableDropColumnsRequest, AlterTableDropColumnsResponse, AnalyzeTableQueryPlanRequest, BatchDeleteTableVersionsRequest, BatchDeleteTableVersionsResponse, BranchContents as ModelBranchContents, CountTableRowsRequest, CreateNamespaceRequest, CreateNamespaceResponse, CreateTableBranchRequest, @@ -72,7 +74,7 @@ use lance_namespace::models::{ UpdateTableTagResponse, }; -use lance_core::{Error, Result}; +use lance_core::{Error, Result, box_error}; use lance_namespace::LanceNamespace; use lance_namespace::error::NamespaceError; use lance_namespace::schema::arrow_schema_to_json; @@ -111,6 +113,70 @@ impl OpsMetrics { } } +/// Build SQL expression list for the add_columns operation. +/// Returns an explicit error when the expression is missing, instead of silently using an empty string. +pub(crate) fn build_sql_expressions( + new_columns: &[lance_namespace::models::AddColumnsEntry], +) -> Result> { + new_columns + .iter() + .map(|col| { + // expression is Option>: outer Option means whether the + // field is present, inner Option means whether the value is JSON null. 
+ let expression = col.expression.clone().and_then(|opt| opt).ok_or_else(|| { + Error::invalid_input(format!( + "Expression is required for new column '{}'", + col.name + )) + })?; + Ok((col.name.clone(), expression)) + }) + .collect() +} + +/// Build column alteration list for the alter_columns operation. +/// Returns an explicit error when data_type conversion fails, instead of silently ignoring it. +pub(crate) fn build_column_alterations( + alterations: &[lance_namespace::models::AlterColumnsEntry], +) -> Result> { + alterations + .iter() + .map(|entry| { + let mut alteration = lance::dataset::ColumnAlteration::new(entry.path.clone()); + // rename is Option>: flatten to get the actual rename value. + if let Some(Some(rename)) = &entry.rename { + alteration = alteration.rename(rename.clone()); + } + // nullable is Option>: flatten to get the actual nullable value. + if let Some(Some(nullable)) = entry.nullable { + alteration = alteration.set_nullable(nullable); + } + // data_type is Option: only process when present and not null. + if let Some(data_type) = &entry.data_type + && !data_type.is_null() + { + let type_str = data_type.as_str().ok_or_else(|| { + Error::invalid_input(format!( + "data_type for column '{}' must be a JSON string, got: {}", + entry.path, data_type + )) + })?; + let json_type = + lance_namespace::models::JsonArrowDataType::new(type_str.to_string()); + let dt = + lance_namespace::schema::convert_json_arrow_type(&json_type).map_err(|e| { + Error::invalid_input(format!( + "Failed to parse data_type '{}' for column '{}': {}", + type_str, entry.path, e + )) + })?; + alteration = alteration.cast_to(dt); + } + Ok(alteration) + }) + .collect() +} + /// Result of checking table status atomically. 
/// /// This struct captures the state of a table directory in a single snapshot, @@ -2887,6 +2953,163 @@ impl LanceNamespace for DirectoryNamespace { }) } + async fn alter_table_add_columns( + &self, + request: AlterTableAddColumnsRequest, + ) -> Result { + if let Some(ref manifest_ns) = self.manifest_ns { + return manifest_ns.alter_table_add_columns(request).await; + } + + // Non-manifest mode: open Dataset directly via table URI and perform the operation + let table_name = Self::table_name_from_id(&request.id)?; + let table_uri = self.table_full_uri(&table_name); + + // Check table existence and deregistration status before opening the dataset + let status = self.check_table_status(&table_name).await; + if !status.exists { + return Err(NamespaceError::TableNotFound { + message: table_name, + } + .into()); + } + if status.is_deregistered { + return Err(NamespaceError::TableNotFound { + message: format!("Table is deregistered: {}", table_name), + } + .into()); + } + + let mut dataset = self + .configured_builder(&table_uri) + .load() + .await + .map_err(|e| { + Error::io_source(box_error(std::io::Error::other(format!( + "Failed to open dataset: {}", + e + )))) + })?; + + let sql_expressions = build_sql_expressions(&request.new_columns)?; + + dataset + .add_columns( + lance::dataset::NewColumnTransform::SqlExpressions(sql_expressions), + None, + None, + ) + .await + .map_err(|e| { + Error::io_source(box_error(std::io::Error::other(format!( + "Failed to add columns: {}", + e + )))) + })?; + + let version = dataset.version().version as i64; + Ok(AlterTableAddColumnsResponse::new(version)) + } + + async fn alter_table_alter_columns( + &self, + request: AlterTableAlterColumnsRequest, + ) -> Result { + if let Some(ref manifest_ns) = self.manifest_ns { + return manifest_ns.alter_table_alter_columns(request).await; + } + + let table_name = Self::table_name_from_id(&request.id)?; + let table_uri = self.table_full_uri(&table_name); + + // Check table existence and 
deregistration status before opening the dataset + let status = self.check_table_status(&table_name).await; + if !status.exists { + return Err(NamespaceError::TableNotFound { + message: table_name, + } + .into()); + } + if status.is_deregistered { + return Err(NamespaceError::TableNotFound { + message: format!("Table is deregistered: {}", table_name), + } + .into()); + } + + let mut dataset = self + .configured_builder(&table_uri) + .load() + .await + .map_err(|e| { + Error::io_source(box_error(std::io::Error::other(format!( + "Failed to open dataset: {}", + e + )))) + })?; + + let alterations = build_column_alterations(&request.alterations)?; + + dataset.alter_columns(&alterations).await.map_err(|e| { + Error::io_source(box_error(std::io::Error::other(format!( + "Failed to alter columns: {}", + e + )))) + })?; + + let version = dataset.version().version as i64; + Ok(AlterTableAlterColumnsResponse::new(version)) + } + + async fn alter_table_drop_columns( + &self, + request: AlterTableDropColumnsRequest, + ) -> Result { + if let Some(ref manifest_ns) = self.manifest_ns { + return manifest_ns.alter_table_drop_columns(request).await; + } + + let table_name = Self::table_name_from_id(&request.id)?; + let table_uri = self.table_full_uri(&table_name); + + // Check table existence and deregistration status before opening the dataset + let status = self.check_table_status(&table_name).await; + if !status.exists { + return Err(NamespaceError::TableNotFound { + message: table_name, + } + .into()); + } + if status.is_deregistered { + return Err(NamespaceError::TableNotFound { + message: format!("Table is deregistered: {}", table_name), + } + .into()); + } + + let mut dataset = self + .configured_builder(&table_uri) + .load() + .await + .map_err(|e| { + Error::io_source(box_error(std::io::Error::other(format!( + "Failed to open dataset: {}", + e + )))) + })?; + + let columns: Vec<&str> = request.columns.iter().map(|s| s.as_str()).collect(); + 
dataset.drop_columns(&columns).await.map_err(|e| { + Error::io_source(box_error(std::io::Error::other(format!( + "Failed to drop columns: {}", + e + )))) + })?; + + let version = dataset.version().version as i64; + Ok(AlterTableDropColumnsResponse::new(version)) + } + async fn list_table_versions( &self, request: ListTableVersionsRequest, @@ -4522,6 +4745,7 @@ mod tests { } #[derive(Debug)] + #[allow(dead_code)] struct CountingFileStoreProvider { listing_count: Arc, } @@ -4557,6 +4781,7 @@ mod tests { } } + #[allow(dead_code)] fn file_object_store_uri(path: &str) -> String { let file_url = uri_to_url(path).unwrap(); let mut url = Url::parse("file-object-store:///").unwrap(); @@ -4564,6 +4789,7 @@ mod tests { url.to_string() } + #[allow(dead_code)] fn build_listing_counting_session(listing_count: Arc) -> Arc { let registry = Arc::new(ObjectStoreRegistry::default()); registry.insert( @@ -10986,6 +11212,55 @@ mod tests { ); } + #[tokio::test] + async fn test_alter_table_add_columns() { + use lance_namespace::models::{ + AddColumnsEntry, AlterTableAddColumnsRequest, DescribeTableRequest, + }; + + let (namespace, _temp_dir) = create_test_namespace().await; + + // Create a table + let schema = create_test_schema(); + let ipc_data = create_test_ipc_data(&schema); + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(vec!["test_table".to_string()]); + namespace + .create_table(create_request, bytes::Bytes::from(ipc_data)) + .await + .unwrap(); + + // Add a new column + let mut new_col = AddColumnsEntry::new("doubled_id".to_string()); + new_col.expression = Some(Some("id * 2".to_string())); + let mut add_request = AlterTableAddColumnsRequest::new(vec![new_col]); + add_request.id = Some(vec!["test_table".to_string()]); + + let response = namespace + .alter_table_add_columns(add_request) + .await + .unwrap(); + assert!( + response.version > 1, + "Version should increment after adding columns" + ); + + // Verify via describe_table + let mut 
describe_request = DescribeTableRequest::new(); + describe_request.id = Some(vec!["test_table".to_string()]); + describe_request.load_detailed_metadata = Some(true); + let describe_response = namespace.describe_table(describe_request).await.unwrap(); + assert!(describe_response.schema.is_some()); + + let resp_schema = describe_response.schema.unwrap(); + let field_names: Vec<&str> = resp_schema.fields.iter().map(|f| f.name.as_str()).collect(); + assert!( + field_names.contains(&"doubled_id"), + "Column 'doubled_id' should exist, got: {:?}", + field_names + ); + } + #[tokio::test] async fn test_update_table_schema_metadata() { use lance_namespace::models::UpdateTableSchemaMetadataRequest; @@ -11013,6 +11288,72 @@ mod tests { ); } + #[tokio::test] + async fn test_alter_table_add_columns_missing_id() { + use lance_namespace::models::{AddColumnsEntry, AlterTableAddColumnsRequest}; + + let (namespace, _temp_dir) = create_test_namespace().await; + + let new_col = AddColumnsEntry::new("col".to_string()); + let request = AlterTableAddColumnsRequest::new(vec![new_col]); + let result = namespace.alter_table_add_columns(request).await; + assert!(result.is_err(), "Should fail when table ID is missing"); + } + + #[tokio::test] + async fn test_alter_table_alter_columns_rename() { + use lance_namespace::models::{ + AlterColumnsEntry, AlterTableAlterColumnsRequest, DescribeTableRequest, + }; + + let (namespace, _temp_dir) = create_test_namespace().await; + + // Create a table + let schema = create_test_schema(); + let ipc_data = create_test_ipc_data(&schema); + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(vec!["test_table".to_string()]); + namespace + .create_table(create_request, bytes::Bytes::from(ipc_data)) + .await + .unwrap(); + + // Rename "name" to "full_name" + let mut entry = AlterColumnsEntry::new("name".to_string()); + entry.rename = Some(Some("full_name".to_string())); + let mut alter_request = 
AlterTableAlterColumnsRequest::new(vec![entry]); + alter_request.id = Some(vec!["test_table".to_string()]); + + let response = namespace + .alter_table_alter_columns(alter_request) + .await + .unwrap(); + assert!( + response.version > 1, + "Version should increment after altering columns" + ); + + // Verify the rename + let mut describe_request = DescribeTableRequest::new(); + describe_request.id = Some(vec!["test_table".to_string()]); + describe_request.load_detailed_metadata = Some(true); + let describe_response = namespace.describe_table(describe_request).await.unwrap(); + assert!(describe_response.schema.is_some()); + + let resp_schema = describe_response.schema.unwrap(); + let field_names: Vec<&str> = resp_schema.fields.iter().map(|f| f.name.as_str()).collect(); + assert!( + field_names.contains(&"full_name"), + "Column should be renamed to 'full_name', got: {:?}", + field_names + ); + assert!( + !field_names.contains(&"name"), + "Old column 'name' should not exist, got: {:?}", + field_names + ); + } + #[tokio::test] async fn test_get_table_stats() { use lance_namespace::models::GetTableStatsRequest; @@ -11064,6 +11405,68 @@ mod tests { ); } + #[tokio::test] + async fn test_alter_table_alter_columns_missing_id() { + use lance_namespace::models::{AlterColumnsEntry, AlterTableAlterColumnsRequest}; + + let (namespace, _temp_dir) = create_test_namespace().await; + + let entry = AlterColumnsEntry::new("name".to_string()); + let request = AlterTableAlterColumnsRequest::new(vec![entry]); + let result = namespace.alter_table_alter_columns(request).await; + assert!(result.is_err(), "Should fail when table ID is missing"); + } + + #[tokio::test] + async fn test_alter_table_drop_columns() { + use lance_namespace::models::{AlterTableDropColumnsRequest, DescribeTableRequest}; + + let (namespace, _temp_dir) = create_test_namespace().await; + + // Create a table + let schema = create_test_schema(); + let ipc_data = create_test_ipc_data(&schema); + let mut create_request = 
CreateTableRequest::new(); + create_request.id = Some(vec!["test_table".to_string()]); + namespace + .create_table(create_request, bytes::Bytes::from(ipc_data)) + .await + .unwrap(); + + // Drop the "name" column + let mut drop_request = AlterTableDropColumnsRequest::new(vec!["name".to_string()]); + drop_request.id = Some(vec!["test_table".to_string()]); + + let response = namespace + .alter_table_drop_columns(drop_request) + .await + .unwrap(); + assert!( + response.version > 1, + "Version should increment after dropping columns" + ); + + // Verify column was dropped + let mut describe_request = DescribeTableRequest::new(); + describe_request.id = Some(vec!["test_table".to_string()]); + describe_request.load_detailed_metadata = Some(true); + let describe_response = namespace.describe_table(describe_request).await.unwrap(); + assert!(describe_response.schema.is_some()); + + let resp_schema = describe_response.schema.unwrap(); + let field_names: Vec<&str> = resp_schema.fields.iter().map(|f| f.name.as_str()).collect(); + assert!( + !field_names.contains(&"name"), + "Column 'name' should be dropped, got: {:?}", + field_names + ); + assert!( + field_names.contains(&"id"), + "Column 'id' should still exist, got: {:?}", + field_names + ); + } + #[tokio::test] async fn test_analyze_table_query_plan() { use lance_namespace::models::AnalyzeTableQueryPlanRequest; @@ -12023,4 +12426,26 @@ mod tests { err ); } + #[tokio::test] + async fn test_alter_table_drop_columns_missing_id() { + use lance_namespace::models::AlterTableDropColumnsRequest; + + let (namespace, _temp_dir) = create_test_namespace().await; + + let request = AlterTableDropColumnsRequest::new(vec!["col".to_string()]); + let result = namespace.alter_table_drop_columns(request).await; + assert!(result.is_err(), "Should fail when table ID is missing"); + } + + #[tokio::test] + async fn test_alter_table_drop_columns_nonexistent_table() { + use lance_namespace::models::AlterTableDropColumnsRequest; + + let (namespace, 
_temp_dir) = create_test_namespace().await; + + let mut request = AlterTableDropColumnsRequest::new(vec!["col".to_string()]); + request.id = Some(vec!["nonexistent".to_string()]); + let result = namespace.alter_table_drop_columns(request).await; + assert!(result.is_err(), "Should fail when table does not exist"); + } } diff --git a/rust/lance-namespace-impls/src/dir/manifest.rs b/rust/lance-namespace-impls/src/dir/manifest.rs index aae924378da..c05db91ad56 100644 --- a/rust/lance-namespace-impls/src/dir/manifest.rs +++ b/rust/lance-namespace-impls/src/dir/manifest.rs @@ -31,7 +31,7 @@ use lance::session::Session; use lance::{Dataset, dataset::scanner::Scanner}; use lance_core::Error as LanceError; use lance_core::datatypes::LANCE_UNENFORCED_PRIMARY_KEY_POSITION; -use lance_core::{Error, ROW_ID, Result}; +use lance_core::{Error, ROW_ID, Result, box_error}; use lance_index::progress::noop_progress; use lance_index::registry::IndexPluginRegistry; use lance_index::scalar::lance_format::LanceIndexStore; @@ -42,6 +42,8 @@ use lance_io::stream::RecordBatchStream as LanceRecordBatchStream; use lance_namespace::LanceNamespace; use lance_namespace::error::NamespaceError; use lance_namespace::models::{ + AlterTableAddColumnsRequest, AlterTableAddColumnsResponse, AlterTableAlterColumnsRequest, + AlterTableAlterColumnsResponse, AlterTableDropColumnsRequest, AlterTableDropColumnsResponse, CreateNamespaceRequest, CreateNamespaceResponse, CreateTableRequest, CreateTableResponse, DeclareTableRequest, DeclareTableResponse, DeregisterTableRequest, DeregisterTableResponse, DescribeNamespaceRequest, DescribeNamespaceResponse, DescribeTableRequest, @@ -3564,6 +3566,174 @@ impl LanceNamespace for ManifestNamespace { ..Default::default() }) } + + /// Add columns to a table. + /// + /// Converts the API `AddColumnsEntry` (SQL expressions) into Lance's + /// `NewColumnTransform::SqlExpressions` and delegates to `Dataset::add_columns`. 
+ async fn alter_table_add_columns( + &self, + request: AlterTableAddColumnsRequest, + ) -> Result { + let table_id = request + .id + .as_ref() + .ok_or_else(|| Error::invalid_input_source("Table ID is required".into()))?; + + if table_id.is_empty() { + return Err(Error::invalid_input_source( + "Table ID cannot be empty".into(), + )); + } + + let object_id = Self::str_object_id(table_id); + let table_info = self.query_manifest_for_table(&object_id).boxed().await?; + + match table_info { + Some(info) => { + let table_uri = Self::construct_full_uri(&self.root, &info.location)?; + // Use DatasetBuilder with storage options to align with describe_table + // and to support custom storage backends (e.g. S3 with custom endpoints). + let mut builder = DatasetBuilder::from_uri(&table_uri); + if let Some(opts) = &self.storage_options { + builder = builder.with_storage_options(opts.clone()); + } + if let Some(session) = &self.session { + builder = builder.with_session(session.clone()); + } + let mut dataset = builder.load().await.map_err(|e| { + Error::io_source(box_error(std::io::Error::other(format!( + "Failed to open dataset: {}", + e + )))) + })?; + + // Use shared helper to build SQL expressions, ensuring a clear error when expression is missing + let sql_expressions = super::build_sql_expressions(&request.new_columns)?; + + dataset + .add_columns( + lance::dataset::NewColumnTransform::SqlExpressions(sql_expressions), + None, + None, + ) + .await + .map_err(|e| { + // Surface specific commit/conflict errors (CommitConflict, + // RetryableCommitConflict, IncompatibleTransaction, ...) rather than + // collapsing every failure into a generic IO error. 
+ convert_lance_commit_error(&e, "add_columns", Some(&object_id)) + })?; + + let version = dataset.version().version as i64; + Ok(AlterTableAddColumnsResponse::new(version)) + } + None => Err(NamespaceError::TableNotFound { message: object_id }.into()), + } + } + + /// Alter columns in a table (rename, change type, change nullability). + /// + /// Converts the API `AlterColumnsEntry` into Lance's `ColumnAlteration` + /// and delegates to `Dataset::alter_columns`. + async fn alter_table_alter_columns( + &self, + request: AlterTableAlterColumnsRequest, + ) -> Result { + let table_id = request + .id + .as_ref() + .ok_or_else(|| Error::invalid_input_source("Table ID is required".into()))?; + + if table_id.is_empty() { + return Err(Error::invalid_input_source( + "Table ID cannot be empty".into(), + )); + } + + let object_id = Self::str_object_id(table_id); + let table_info = self.query_manifest_for_table(&object_id).boxed().await?; + + match table_info { + Some(info) => { + let table_uri = Self::construct_full_uri(&self.root, &info.location)?; + let mut builder = DatasetBuilder::from_uri(&table_uri); + if let Some(opts) = &self.storage_options { + builder = builder.with_storage_options(opts.clone()); + } + if let Some(session) = &self.session { + builder = builder.with_session(session.clone()); + } + let mut dataset = builder.load().await.map_err(|e| { + Error::io_source(box_error(std::io::Error::other(format!( + "Failed to open dataset: {}", + e + )))) + })?; + + // Use shared helper to build column alterations, ensuring a clear error when data_type conversion fails + let alterations = super::build_column_alterations(&request.alterations)?; + + dataset.alter_columns(&alterations).await.map_err(|e| { + convert_lance_commit_error(&e, "alter_columns", Some(&object_id)) + })?; + + let version = dataset.version().version as i64; + Ok(AlterTableAlterColumnsResponse::new(version)) + } + None => Err(NamespaceError::TableNotFound { message: object_id }.into()), + } + } + + /// 
Drop columns from a table. + /// + /// Delegates to `Dataset::drop_columns` with the column names from the request. + async fn alter_table_drop_columns( + &self, + request: AlterTableDropColumnsRequest, + ) -> Result { + let table_id = request + .id + .as_ref() + .ok_or_else(|| Error::invalid_input_source("Table ID is required".into()))?; + + if table_id.is_empty() { + return Err(Error::invalid_input_source( + "Table ID cannot be empty".into(), + )); + } + + let object_id = Self::str_object_id(table_id); + let table_info = self.query_manifest_for_table(&object_id).boxed().await?; + + match table_info { + Some(info) => { + let table_uri = Self::construct_full_uri(&self.root, &info.location)?; + let mut builder = DatasetBuilder::from_uri(&table_uri); + if let Some(opts) = &self.storage_options { + builder = builder.with_storage_options(opts.clone()); + } + if let Some(session) = &self.session { + builder = builder.with_session(session.clone()); + } + let mut dataset = builder.load().await.map_err(|e| { + Error::io_source(box_error(std::io::Error::other(format!( + "Failed to open dataset: {}", + e + )))) + })?; + + let columns: Vec<&str> = request.columns.iter().map(|s| s.as_str()).collect(); + dataset.drop_columns(&columns).await.map_err(|e| { + convert_lance_commit_error(&e, "drop_columns", Some(&object_id)) + })?; + + let version = dataset.version().version as i64; + Ok(AlterTableDropColumnsResponse::new(version)) + } + None => Err(NamespaceError::TableNotFound { message: object_id }.into()), + } + } } #[cfg(test)] @@ -5554,4 +5724,297 @@ mod tests { assert_eq!(n, names(&["c", "d"])); assert_eq!(next, Some("d".to_string())); } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_alter_table_add_columns(#[case] inline_optimization: bool) { + use lance_namespace::models::{ + AddColumnsEntry, AlterTableAddColumnsRequest, DescribeTableRequest, + }; + + let temp_dir = TempStdDir::default(); + let 
temp_path = temp_dir.to_str().unwrap(); + + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + // Create a table with id and name columns + let buffer = create_test_ipc_data(); + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(vec!["test_table".to_string()]); + dir_namespace + .create_table(create_request, Bytes::from(buffer)) + .await + .unwrap(); + + // Add a new column using SQL expression + let mut new_col = AddColumnsEntry::new("doubled_id".to_string()); + new_col.expression = Some(Some("id * 2".to_string())); + let mut add_request = AlterTableAddColumnsRequest::new(vec![new_col]); + add_request.id = Some(vec!["test_table".to_string()]); + + let response = dir_namespace + .alter_table_add_columns(add_request) + .await + .unwrap(); + // Version should have incremented + assert!(response.version > 1); + + // Verify the column was added by describing the table with detailed metadata + let mut describe_request = DescribeTableRequest::new(); + describe_request.id = Some(vec!["test_table".to_string()]); + describe_request.load_detailed_metadata = Some(true); + let describe_response = dir_namespace + .describe_table(describe_request) + .await + .unwrap(); + assert!(describe_response.schema.is_some()); + + let schema = describe_response.schema.unwrap(); + let field_names: Vec<&str> = schema.fields.iter().map(|f| f.name.as_str()).collect(); + assert!( + field_names.contains(&"doubled_id"), + "Column 'doubled_id' should exist after add_columns, got: {:?}", + field_names + ); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_alter_table_add_columns_missing_id(#[case] inline_optimization: bool) { + use lance_namespace::models::{AddColumnsEntry, AlterTableAddColumnsRequest}; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let 
dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + // Request without ID should fail + let new_col = AddColumnsEntry::new("col".to_string()); + let request = AlterTableAddColumnsRequest::new(vec![new_col]); + let result = dir_namespace.alter_table_add_columns(request).await; + assert!(result.is_err(), "Should fail when table ID is missing"); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_alter_table_add_columns_nonexistent_table(#[case] inline_optimization: bool) { + use lance_namespace::models::{AddColumnsEntry, AlterTableAddColumnsRequest}; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + // Request with non-existent table should fail + let new_col = AddColumnsEntry::new("col".to_string()); + let mut request = AlterTableAddColumnsRequest::new(vec![new_col]); + request.id = Some(vec!["nonexistent".to_string()]); + let result = dir_namespace.alter_table_add_columns(request).await; + assert!(result.is_err(), "Should fail when table does not exist"); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_alter_table_alter_columns_rename(#[case] inline_optimization: bool) { + use lance_namespace::models::{ + AlterColumnsEntry, AlterTableAlterColumnsRequest, DescribeTableRequest, + }; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + // Create a table + let buffer = create_test_ipc_data(); + let mut create_request = CreateTableRequest::new(); + 
create_request.id = Some(vec!["test_table".to_string()]); + dir_namespace + .create_table(create_request, Bytes::from(buffer)) + .await + .unwrap(); + + // Rename the "name" column to "full_name" + let mut entry = AlterColumnsEntry::new("name".to_string()); + entry.rename = Some(Some("full_name".to_string())); + let mut alter_request = AlterTableAlterColumnsRequest::new(vec![entry]); + alter_request.id = Some(vec!["test_table".to_string()]); + + let response = dir_namespace + .alter_table_alter_columns(alter_request) + .await + .unwrap(); + assert!(response.version > 1); + + // Verify the column was renamed + let mut describe_request = DescribeTableRequest::new(); + describe_request.id = Some(vec!["test_table".to_string()]); + describe_request.load_detailed_metadata = Some(true); + let describe_response = dir_namespace + .describe_table(describe_request) + .await + .unwrap(); + assert!(describe_response.schema.is_some()); + + let schema = describe_response.schema.unwrap(); + let field_names: Vec<&str> = schema.fields.iter().map(|f| f.name.as_str()).collect(); + assert!( + field_names.contains(&"full_name"), + "Column should be renamed to 'full_name', got: {:?}", + field_names + ); + assert!( + !field_names.contains(&"name"), + "Old column name 'name' should no longer exist, got: {:?}", + field_names + ); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_alter_table_alter_columns_missing_id(#[case] inline_optimization: bool) { + use lance_namespace::models::{AlterColumnsEntry, AlterTableAlterColumnsRequest}; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + let entry = AlterColumnsEntry::new("name".to_string()); + let request = AlterTableAlterColumnsRequest::new(vec![entry]); + let result = 
dir_namespace.alter_table_alter_columns(request).await; + assert!(result.is_err(), "Should fail when table ID is missing"); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_alter_table_drop_columns(#[case] inline_optimization: bool) { + use lance_namespace::models::{AlterTableDropColumnsRequest, DescribeTableRequest}; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + // Create a table with id and name columns + let buffer = create_test_ipc_data(); + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(vec!["test_table".to_string()]); + dir_namespace + .create_table(create_request, Bytes::from(buffer)) + .await + .unwrap(); + + // Drop the "name" column + let mut drop_request = AlterTableDropColumnsRequest::new(vec!["name".to_string()]); + drop_request.id = Some(vec!["test_table".to_string()]); + + let response = dir_namespace + .alter_table_drop_columns(drop_request) + .await + .unwrap(); + assert!(response.version > 1); + + // Verify the column was dropped + let mut describe_request = DescribeTableRequest::new(); + describe_request.id = Some(vec!["test_table".to_string()]); + describe_request.load_detailed_metadata = Some(true); + let describe_response = dir_namespace + .describe_table(describe_request) + .await + .unwrap(); + assert!(describe_response.schema.is_some()); + + let schema = describe_response.schema.unwrap(); + let field_names: Vec<&str> = schema.fields.iter().map(|f| f.name.as_str()).collect(); + assert!( + !field_names.contains(&"name"), + "Column 'name' should have been dropped, got: {:?}", + field_names + ); + assert!( + field_names.contains(&"id"), + "Column 'id' should still exist, got: {:?}", + field_names + ); + } + + #[rstest] + 
#[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_alter_table_drop_columns_missing_id(#[case] inline_optimization: bool) { + use lance_namespace::models::AlterTableDropColumnsRequest; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + let request = AlterTableDropColumnsRequest::new(vec!["col".to_string()]); + let result = dir_namespace.alter_table_drop_columns(request).await; + assert!(result.is_err(), "Should fail when table ID is missing"); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_alter_table_drop_columns_nonexistent_table(#[case] inline_optimization: bool) { + use lance_namespace::models::AlterTableDropColumnsRequest; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + let mut request = AlterTableDropColumnsRequest::new(vec!["col".to_string()]); + request.id = Some(vec!["nonexistent".to_string()]); + let result = dir_namespace.alter_table_drop_columns(request).await; + assert!(result.is_err(), "Should fail when table does not exist"); + } } From 7b9ea600decdaa65e97ad67a8046f481ab608246 Mon Sep 17 00:00:00 2001 From: Yang Cen Date: Tue, 23 Jun 2026 11:24:26 +0800 Subject: [PATCH 167/177] fix(index): use range block max for fts conjunction (#7387) ## Bug Fix What is the bug? FTS AND/conjunction block-max pruning could only ask a posting for its current block max score. 
When the lead posting defines a wider `up_to` window, another posting can have a higher block max later in that same window, so using only its current block can understate the safe upper bound for Lucene-style `getMaxScore(upTo)`. What issues or incorrect behavior does the bug cause? The understated upper bound can make `and_advance_target` skip a lead block window even though a later document in that window could still beat the current top-k threshold. For exact BM25 search, pruning must use a safe upper bound so possible top-k documents are not dropped. How does this PR fix the problem? This adds a query-time `BlockMaxWindow` to compressed posting iterators. The window lazily maintains a monotonic deque of block max scores over `[current shallow block, block containing up_to]`. AND/conjunction now lets the lead posting choose `up_to` and asks each follower for a range max that safely covers that same `up_to`. Plain postings still fall back to their existing list-level upper bound. This does not change the index format or posting-list build path. 
## Tests - `cargo fmt --all --check` - `git diff --check` - `CARGO_TARGET_DIR=/tmp/lance-target-fts-and-rangemax-main cargo test -p lance-index scalar::inverted::wand::tests -- --nocapture` - `CARGO_TARGET_DIR=/tmp/lance-target-fts-and-rangemax-clippy cargo clippy --all --tests --benches -- -D warnings` --------- Co-authored-by: Lu Qiu --- rust/lance-index/src/scalar/inverted/wand.rs | 451 ++++++++++++++++++- 1 file changed, 438 insertions(+), 13 deletions(-) diff --git a/rust/lance-index/src/scalar/inverted/wand.rs b/rust/lance-index/src/scalar/inverted/wand.rs index 485aa99dced..0fe0bb0349b 100644 --- a/rust/lance-index/src/scalar/inverted/wand.rs +++ b/rust/lance-index/src/scalar/inverted/wand.rs @@ -4,7 +4,10 @@ use std::ops::Deref; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, LazyLock}; -use std::{cell::UnsafeCell, collections::BinaryHeap}; +use std::{ + cell::UnsafeCell, + collections::{BinaryHeap, VecDeque}, +}; use std::{cmp::Reverse, fmt::Debug}; use arrow::array::AsArray; @@ -35,7 +38,6 @@ use super::{ use super::{DocInfo, builder::BLOCK_SIZE}; const TERMINATED_DOC_ID: u64 = u64::MAX; - pub static FLAT_SEARCH_PERCENT_THRESHOLD: LazyLock = LazyLock::new(|| { std::env::var("LANCE_FLAT_SEARCH_PERCENT_THRESHOLD") .unwrap_or_else(|_| "10".to_string()) @@ -68,6 +70,7 @@ struct CompressedState { position_block_idx: Option, position_values: Vec, position_offsets: Vec, + block_max_window: BlockMaxWindow, } impl CompressedState { @@ -80,6 +83,7 @@ impl CompressedState { position_block_idx: None, position_values: Vec::new(), position_offsets: Vec::new(), + block_max_window: BlockMaxWindow::new(), } } @@ -114,6 +118,91 @@ impl CompressedState { } } +#[derive(Clone)] +struct BlockMaxWindow { + // Sliding block range used for Lucene-style getMaxScore(upTo). The deque is + // monotonic by score and covers blocks in [start_block_idx, next_block_idx). 
+ start_block_idx: usize, + next_block_idx: usize, + max_scores: VecDeque<(usize, f32)>, +} + +struct BlockMaxScore { + score: f32, + blocks_scanned: usize, +} + +impl BlockMaxWindow { + fn new() -> Self { + Self { + start_block_idx: 0, + next_block_idx: 0, + max_scores: VecDeque::new(), + } + } + + fn reset(&mut self, start_block_idx: usize) { + self.start_block_idx = start_block_idx; + self.next_block_idx = start_block_idx; + self.max_scores.clear(); + } + + fn max_score_up_to( + &mut self, + list: &CompressedPostingList, + start_block_idx: usize, + up_to: u64, + ) -> BlockMaxScore { + if start_block_idx >= list.blocks.len() { + self.reset(start_block_idx); + return BlockMaxScore { + score: 0.0, + blocks_scanned: 0, + }; + } + if start_block_idx < self.start_block_idx || start_block_idx > self.next_block_idx { + self.reset(start_block_idx); + } + self.start_block_idx = start_block_idx; + while matches!(self.max_scores.front(), Some((block_idx, _)) if *block_idx < start_block_idx) + { + self.max_scores.pop_front(); + } + + if list.block_least_doc_id(start_block_idx) as u64 > up_to { + self.reset(start_block_idx); + return BlockMaxScore { + score: 0.0, + blocks_scanned: 0, + }; + } + + self.next_block_idx = self.next_block_idx.max(start_block_idx); + let mut blocks_scanned = 0; + while self.next_block_idx < list.blocks.len() + && list.block_least_doc_id(self.next_block_idx) as u64 <= up_to + { + let score = list.block_max_score(self.next_block_idx); + while matches!(self.max_scores.back(), Some((_, old_score)) if *old_score <= score) { + self.max_scores.pop_back(); + } + self.max_scores.push_back((self.next_block_idx, score)); + self.next_block_idx += 1; + blocks_scanned += 1; + } + + let score = self + .max_scores + .front() + .map(|(_, score)| *score) + .unwrap_or(0.0); + BlockMaxScore { + score, + blocks_scanned, + } + } +} + impl Debug for PostingIterator { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("PostingIterator") @@ 
-412,6 +501,27 @@ impl PostingIterator { } } + #[inline] + fn block_max_score_up_to_with_stats(&mut self, up_to: u64) -> BlockMaxScore { + match self.list { + PostingList::Compressed(ref list) => { + let compressed = unsafe { &mut *self.compressed_state_ptr() }; + compressed + .block_max_window + .max_score_up_to(list, self.block_idx, up_to) + } + PostingList::Plain(_) => BlockMaxScore { + score: self.approximate_upper_bound, + blocks_scanned: 0, + }, + } + } + + #[inline] + fn is_compressed(&self) -> bool { + matches!(self.list, PostingList::Compressed(_)) + } + fn block_first_doc(&self) -> Option { match self.list { PostingList::Compressed(ref list) => { @@ -540,6 +650,15 @@ impl PartialEq for TailPosting { } } +#[derive(Default)] +struct AndWindowStats { + windows_wide: usize, + windows_narrow: usize, + windows_skipped: usize, + range_blocks_scanned: usize, + candidates_returned: usize, +} + impl Eq for TailPosting {} impl PartialOrd for TailPosting { @@ -588,6 +707,7 @@ pub struct Wand<'a, S: Scorer> { // Last conjunction doc returned to the caller. The next conjunction search // resumes strictly after this doc, like Lucene's `nextDoc()/advance()`. 
and_last_doc: Option, + and_window_stats: AndWindowStats, and_candidates_pruned_before_return: usize, docs: &'a DocSet, scorer: S, @@ -651,6 +771,7 @@ impl<'a, S: Scorer> Wand<'a, S> { up_to: None, and_max_score: f32::INFINITY, and_last_doc: None, + and_window_stats: AndWindowStats::default(), and_candidates_pruned_before_return: 0, docs, scorer, @@ -824,6 +945,16 @@ impl<'a, S: Scorer> Wand<'a, S> { self.push_back_leads(doc.doc_id() + 1); } } + if self.operator == Operator::And { + tracing::debug!( + and_windows_wide = self.and_window_stats.windows_wide, + and_windows_narrow = self.and_window_stats.windows_narrow, + and_windows_skipped = self.and_window_stats.windows_skipped, + and_range_blocks_scanned = self.and_window_stats.range_blocks_scanned, + and_candidates_returned = self.and_window_stats.candidates_returned, + "fts conjunction block-max window stats" + ); + } metrics.record_comparisons(num_comparisons); let and_candidates_pruned_before_return = self .and_candidates_pruned_before_return @@ -1035,7 +1166,11 @@ impl<'a, S: Scorer> Wand<'a, S> { // from `tail` iterators that are advanced to the same doc later. 
fn next(&mut self) -> Result> { if self.operator == Operator::And { - return Ok(self.next_and_candidate().map(|doc| (doc, 0.0))); + let candidate = self.next_and_candidate(); + if candidate.is_some() { + self.and_window_stats.candidates_returned += 1; + } + return Ok(candidate.map(|doc| (doc, 0.0))); } while let Some(target) = self.head_doc() { @@ -1167,6 +1302,14 @@ impl<'a, S: Scorer> Wand<'a, S> { } } + fn posting_block_up_to(posting: &PostingIterator, target: u64) -> u64 { + posting + .next_block_first_doc() + .map(|doc| doc.saturating_sub(1)) + .unwrap_or(TERMINATED_DOC_ID) + .max(target) + } + fn and_move_to_next_block(&mut self, target: u64) { if self.threshold <= 0.0 { self.up_to = Some(target); @@ -1174,19 +1317,65 @@ impl<'a, S: Scorer> Wand<'a, S> { return; } - let mut up_to = TERMINATED_DOC_ID; - let mut max_score = 0.0; + if self.lead.is_empty() { + self.up_to = Some(TERMINATED_DOC_ID); + self.and_max_score = 0.0; + return; + } + for posting in &mut self.lead { posting.shallow_next(target); - let block_end = posting - .next_block_first_doc() - .map(|doc| doc.saturating_sub(1)) - .unwrap_or(TERMINATED_DOC_ID); - up_to = up_to.min(block_end.max(target)); - max_score += posting.block_max_score(); } - self.up_to = Some(up_to); - self.and_max_score = max_score; + + let narrow_up_to = self + .lead + .iter() + .map(|posting| Self::posting_block_up_to(posting, target)) + .min() + .unwrap_or(TERMINATED_DOC_ID); + let narrow_max_score = self + .lead + .iter() + .map(|posting| posting.block_max_score()) + .sum::(); + + if narrow_max_score >= self.threshold { + self.up_to = Some(narrow_up_to); + self.and_max_score = narrow_max_score; + self.and_window_stats.windows_narrow += 1; + return; + } + + let lead_up_to = self + .lead + .first() + .map(|posting| Self::posting_block_up_to(posting, target)) + .unwrap_or(TERMINATED_DOC_ID); + let can_try_wide = lead_up_to > narrow_up_to + && lead_up_to != TERMINATED_DOC_ID + && self.lead.iter().all(|posting| 
posting.is_compressed()); + + if can_try_wide { + let mut wide_max_score = 0.0; + let mut range_blocks_scanned = 0; + for posting in &mut self.lead { + let block_max = posting.block_max_score_up_to_with_stats(lead_up_to); + wide_max_score += block_max.score; + range_blocks_scanned += block_max.blocks_scanned; + } + self.and_window_stats.range_blocks_scanned += range_blocks_scanned; + + if wide_max_score < self.threshold { + self.up_to = Some(lead_up_to); + self.and_max_score = wide_max_score; + self.and_window_stats.windows_wide += 1; + return; + } + } + + self.up_to = Some(narrow_up_to); + self.and_max_score = narrow_max_score; + self.and_window_stats.windows_narrow += 1; } fn and_advance_target(&mut self, mut target: u64) -> u64 { @@ -1201,6 +1390,7 @@ impl<'a, S: Scorer> Wand<'a, S> { if self.and_max_score >= self.threshold { return target; } + self.and_window_stats.windows_skipped += 1; if up_to == TERMINATED_DOC_ID { return TERMINATED_DOC_ID; } @@ -1900,6 +2090,18 @@ mod tests { } } + fn sorted_candidate_row_ids(candidates: Vec) -> Vec { + let mut row_ids = candidates + .into_iter() + .map(|candidate| match candidate.addr { + CandidateAddr::RowId(row_id) => row_id, + CandidateAddr::Pending(doc_id) => doc_id as u64, + }) + .collect::>(); + row_ids.sort_unstable(); + row_ids + } + #[rstest] #[tokio::test] async fn test_wand(#[values(false, true)] is_compressed: bool) { @@ -2269,6 +2471,229 @@ mod tests { assert_eq!(candidate.0.doc_id(), BLOCK_SIZE as u64); } + #[test] + fn test_and_advance_falls_back_to_narrow_when_range_max_loosens_bound() { + let total = 4 * BLOCK_SIZE as u32; + let mut docs = DocSet::default(); + for i in 0..total { + docs.append(i as u64, 1); + } + + let lead_docs = (0..total).step_by(2).collect::>(); + let follower_docs = (0..total).collect::>(); + let postings = vec![ + PostingIterator::with_query_weight( + String::from("lead"), + 0, + 0, + 1.0, + generate_posting_list(lead_docs, 1.0, Some(vec![1.0, 1.0]), true), + docs.len(), + ), + 
PostingIterator::with_query_weight( + String::from("follower"), + 1, + 1, + 1.0, + generate_posting_list(follower_docs, 10.0, Some(vec![0.1, 10.0, 0.1, 0.1]), true), + docs.len(), + ), + ]; + + let mut wand = Wand::new(Operator::And, postings.into_iter(), &docs, UnitScorer); + wand.threshold = 5.0; + + let target = wand.and_advance_target(0); + + assert_eq!(target, BLOCK_SIZE as u64); + assert_eq!(wand.up_to, Some((2 * BLOCK_SIZE - 1) as u64)); + assert!( + (wand.and_max_score - 11.0).abs() < 1e-6, + "expected the second narrow window to include the high follower block, got {}", + wand.and_max_score + ); + assert_eq!(wand.and_window_stats.windows_wide, 0); + assert_eq!(wand.and_window_stats.windows_narrow, 2); + assert_eq!(wand.and_window_stats.windows_skipped, 1); + } + + #[test] + fn test_and_advance_uses_narrow_window_for_candidate_ranges() { + let total = 4 * BLOCK_SIZE as u32; + let mut docs = DocSet::default(); + for i in 0..total { + docs.append(i as u64, 1); + } + + let lead_docs = (0..total).step_by(2).collect::>(); + let follower_docs = (0..total).collect::>(); + let postings = vec![ + PostingIterator::with_query_weight( + String::from("lead"), + 0, + 0, + 1.0, + generate_posting_list(lead_docs, 1.0, Some(vec![1.0, 1.0]), true), + docs.len(), + ), + PostingIterator::with_query_weight( + String::from("follower"), + 1, + 1, + 1.0, + generate_posting_list(follower_docs, 1.0, Some(vec![1.0, 1.0, 1.0, 1.0]), true), + docs.len(), + ), + ]; + + let mut wand = Wand::new(Operator::And, postings.into_iter(), &docs, UnitScorer); + wand.threshold = 1.5; + + let target = wand.and_advance_target(0); + + assert_eq!(target, 0); + assert_eq!(wand.up_to, Some((BLOCK_SIZE - 1) as u64)); + assert!((wand.and_max_score - 2.0).abs() < 1e-6); + assert_eq!(wand.and_window_stats.windows_wide, 0); + assert_eq!(wand.and_window_stats.windows_narrow, 1); + assert_eq!(wand.and_window_stats.range_blocks_scanned, 0); + } + + #[test] + fn 
test_and_wide_window_only_skips_and_does_not_return_candidates() { + let total = 4 * BLOCK_SIZE as u32; + let mut docs = DocSet::default(); + for i in 0..total { + docs.append(i as u64, 1); + } + + let lead_docs = (0..total).step_by(2).collect::>(); + let follower_docs = (0..total).collect::>(); + let postings = vec![ + PostingIterator::with_query_weight( + String::from("lead"), + 0, + 0, + 1.0, + generate_posting_list(lead_docs, 3.0, Some(vec![1.0, 3.0]), true), + docs.len(), + ), + PostingIterator::with_query_weight( + String::from("follower"), + 1, + 1, + 1.0, + generate_posting_list(follower_docs, 3.0, Some(vec![0.1, 0.1, 3.0, 3.0]), true), + docs.len(), + ), + ]; + + let mut wand = Wand::new(Operator::And, postings.into_iter(), &docs, UnitScorer); + wand.threshold = 2.0; + + let candidate = wand.next().unwrap().unwrap(); + + assert_eq!(candidate.0.doc_id(), (2 * BLOCK_SIZE) as u64); + assert_eq!(wand.up_to, Some((3 * BLOCK_SIZE - 1) as u64)); + assert_eq!(wand.and_window_stats.windows_wide, 1); + assert_eq!(wand.and_window_stats.windows_skipped, 1); + assert_eq!(wand.and_window_stats.windows_narrow, 1); + assert_eq!(wand.and_window_stats.candidates_returned, 1); + } + + #[test] + fn test_and_range_max_preserves_exact_top_k() { + let total = 4 * BLOCK_SIZE as u32; + let hot = BLOCK_SIZE as u32..BLOCK_SIZE as u32 + 16; + let mut docs = DocSet::default(); + for doc_id in 0..total { + let doc_tokens = if hot.contains(&doc_id) { 1 } else { 1000 }; + docs.append(doc_id as u64, doc_tokens); + } + + let params = FtsSearchParams::new().with_limit(Some(8)); + let run = |is_compressed: bool| { + let lead_docs = (0..total).step_by(2).collect::>(); + let follower_docs = (0..total).collect::>(); + let lead_scores = is_compressed.then_some(vec![1.0, 0.001]); + let follower_scores = is_compressed.then_some(vec![0.001, 1.0, 0.001, 0.001]); + let postings = vec![ + PostingIterator::with_query_weight( + String::from("lead"), + 0, + 0, + 1.0, + generate_posting_list(lead_docs, 
1.0, lead_scores, is_compressed), + docs.len(), + ), + PostingIterator::with_query_weight( + String::from("follower"), + 1, + 1, + 1.0, + generate_posting_list(follower_docs, 1.0, follower_scores, is_compressed), + docs.len(), + ), + ]; + let mut wand = Wand::new( + Operator::And, + postings.into_iter(), + &docs, + InverseDocLengthScorer, + ); + sorted_candidate_row_ids( + wand.search( + &params, + Arc::new(RowAddrMask::default()), + &NoOpMetricsCollector, + ) + .unwrap(), + ) + }; + + let compressed = run(true); + let plain = run(false); + let expected = hot.step_by(2).map(u64::from).collect::>(); + assert_eq!(compressed, expected); + assert_eq!(compressed, plain); + } + + #[test] + fn test_block_max_score_up_to_slides_and_expires_old_max() { + let total = 5 * BLOCK_SIZE as u32; + let posting = generate_posting_list( + (0..total).collect(), + 5.0, + Some(vec![1.0, 4.0, 2.0, 5.0, 3.0]), + true, + ); + let mut posting = PostingIterator::new(String::from("term"), 0, 0, posting, total as usize); + + posting.shallow_next(0); + assert_eq!( + posting + .block_max_score_up_to_with_stats((3 * BLOCK_SIZE - 1) as u64) + .score, + 4.0 + ); + + posting.shallow_next((2 * BLOCK_SIZE) as u64); + assert_eq!( + posting + .block_max_score_up_to_with_stats((4 * BLOCK_SIZE - 1) as u64) + .score, + 5.0 + ); + + posting.shallow_next((4 * BLOCK_SIZE) as u64); + assert_eq!( + posting + .block_max_score_up_to_with_stats((5 * BLOCK_SIZE - 1) as u64) + .score, + 3.0 + ); + } + #[test] fn test_and_candidate_prune_scores_first_term_before_full_score() { let total_docs = 2 * BLOCK_SIZE as u32 + 1; From aa544a853b85936554d1615a25b37f08ce50d682 Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Tue, 23 Jun 2026 11:32:09 +0800 Subject: [PATCH 168/177] perf(index): improve FTS search metadata caching (#7398) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary This PR improves FTS search throughput by avoiding repeated metadata reads on hot search paths: - 
caches immutable corpus-level BM25 stats on the loaded `InvertedIndex` - caches per-token posting metadata (`max_score`, posting length) in the existing partition-prefixed Lance cache - leaves the existing resident token-set behavior unchanged and does not cache posting list bodies The main target is global QPS under concurrent full-text search, especially when the index is stored on object storage. ## S3 Performance Benchmark shape for both datasets: - query set: `the`, `data`, `learning`, `world`, `machine learning`, `artificial intelligence`, `中国`, `人工智能` - `limit=10`, projected columns: `_rowid`, `_score` - warmup: query set x3 - each concurrency point runs for 20s - baseline and patched results returned identical row ids and 6-decimal scores for the query set ### 1M S3 Dataset Dataset: `s3://xuanwo-fts-bench-use1/datasets/mmlb_1m_all_columns_no_image_en_zh_icu_bench_icu-1m-perf-opt-20260619T143109Z.lance` | concurrency | baseline QPS | patched QPS | QPS delta | baseline p95 | patched p95 | |---:|---:|---:|---:|---:|---:| | 1 | 7.73 | 13.43 | +73.7% | 202.45ms | 108.64ms | | 2 | 15.51 | 24.49 | +57.9% | 210.17ms | 122.68ms | | 4 | 34.45 | 53.01 | +53.9% | 184.70ms | 125.08ms | | 8 | 71.74 | 96.25 | +34.2% | 171.57ms | 129.44ms | | 16 | 120.33 | 199.30 | +65.6% | 226.07ms | 125.03ms | | 32 | 214.90 | 242.96 | +13.1% | 279.15ms | 283.01ms | The 32-concurrency point is saturated/noisy; the improvement is stable at 1-16 concurrency. 
### 10M S3 Dataset Dataset: `s3://xuanwo-fts-bench-use1/datasets/mmlb_10m_full_content_icu_s3_search_20260623T000000Z.lance` - 10,000,000 rows, 10 fragments - 19 S3 objects, 69,994,744,162 bytes total - FTS index size: 7.76 GiB | concurrency | baseline QPS | patched QPS | QPS delta | baseline p95 | patched p95 | |---:|---:|---:|---:|---:|---:| | 1 | 7.40 | 11.55 | +56.1% | 290.69ms | 152.26ms | | 2 | 14.45 | 22.90 | +58.5% | 236.91ms | 132.04ms | | 4 | 35.20 | 44.90 | +27.6% | 175.41ms | 145.16ms | | 8 | 64.10 | 86.55 | +35.0% | 198.30ms | 142.27ms | | 16 | 132.55 | 160.40 | +21.0% | 185.01ms | 163.47ms | | 32 | 211.95 | 235.70 | +11.2% | 283.94ms | 305.04ms | The 10M S3 result confirms the object-store improvement at larger index scale. The 32-concurrency point remains saturated/noisy and has a p95 regression despite higher QPS. ## Validation - `cargo fmt --all` - `git diff --check` - `cargo test -p lance-index scalar::inverted::index::tests::` - `cargo clippy --all --tests --benches -- -D warnings` Co-authored-by: LuQQiu --- rust/lance-index/src/scalar/inverted/index.rs | 154 +++++++++++++----- 1 file changed, 110 insertions(+), 44 deletions(-) diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index ac13fc0c585..c23dc1c4e78 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -49,7 +49,7 @@ use lance_core::{Error, ROW_ID, ROW_ID_FIELD, Result}; use lance_select::{RowAddrMask, RowAddrTreeMap}; use roaring::RoaringBitmap; use std::sync::LazyLock; -use tokio::task::spawn_blocking; +use tokio::{sync::OnceCell, task::spawn_blocking}; use tracing::{info, instrument}; use super::encoding::{PositionBlockBuilder, decode_group_starts}; @@ -419,6 +419,7 @@ pub struct InvertedIndex { tokenizer: Box, token_set_format: TokenSetFormat, pub(crate) partitions: Vec>, + corpus_stats: Arc>, // Fragments which are contained in the index, but no longer in the dataset. 
// These should be pruned at search time since we don't prune them at update time. deleted_fragments: RoaringBitmap, @@ -666,21 +667,26 @@ impl InvertedIndex { /// `LazyDocSet`. Avoids materializing the full DocSet just to get /// these two scalars. async fn aggregate_corpus_stats(&self) -> Result<(u64, usize)> { - let io_parallelism = self.store.io_parallelism(); - let num_docs: usize = self.partitions.iter().map(|p| p.docs.len()).sum(); - let futures = self - .partitions - .iter() - .map(|p| { - let docs = p.docs.clone(); - async move { docs.total_tokens_num().await } + self.corpus_stats + .get_or_try_init(|| async { + let io_parallelism = self.store.io_parallelism(); + let num_docs: usize = self.partitions.iter().map(|p| p.docs.len()).sum(); + let futures = self + .partitions + .iter() + .map(|p| { + let docs = p.docs.clone(); + async move { docs.total_tokens_num().await } + }) + .collect::>(); + let totals: Vec = stream::iter(futures) + .buffer_unordered(io_parallelism) + .try_collect() + .await?; + Ok((totals.into_iter().sum(), num_docs)) }) - .collect::>(); - let totals: Vec = stream::iter(futures) - .buffer_unordered(io_parallelism) - .try_collect() - .await?; - Ok((totals.into_iter().sum(), num_docs)) + .await + .copied() } /// Sum the posting-list length for `term` across this index's partitions @@ -997,6 +1003,7 @@ impl InvertedIndex { docs: Arc::new(LazyDocSet::from_loaded(docs)), token_set_format: TokenSetFormat::Arrow, })], + corpus_stats: Arc::new(OnceCell::new()), deleted_fragments: RoaringBitmap::new(), })) } @@ -1084,6 +1091,7 @@ impl InvertedIndex { tokenizer, token_set_format, partitions, + corpus_stats: Arc::new(OnceCell::new()), deleted_fragments, })) } @@ -2193,7 +2201,7 @@ enum PostingMetadata { /// `ensure_metadata_loaded`, and the stats path can also fetch a single /// token via `posting_len_for_token` without forcing the bulk load. 
V2 { - metadata: tokio::sync::OnceCell, + metadata: OnceCell, }, } @@ -2272,7 +2280,7 @@ impl PostingListReader { } } else { PostingMetadata::V2 { - metadata: tokio::sync::OnceCell::new(), + metadata: OnceCell::new(), } }; @@ -2373,10 +2381,10 @@ impl PostingListReader { } /// Async access to a single token's posting list length. For v2 - /// indexes this reads a single row from `LENGTH_COL` if the bulk metadata - /// has not been loaded yet, and never triggers the bulk load itself. The - /// stats path uses this so a single-term `df` lookup costs O(1) bytes - /// rather than O(num_unique_tokens). + /// indexes this reads one row of posting metadata if the bulk metadata has + /// not been loaded yet, and never triggers the bulk load itself. The stats + /// path uses this so a single-term `df` lookup costs O(1) bytes rather + /// than O(num_unique_tokens). pub(crate) async fn posting_len_for_token(&self, token_id: u32) -> Result { match &self.metadata { PostingMetadata::LegacyV1 { .. } => Ok(self.posting_len(token_id)), @@ -2384,13 +2392,10 @@ impl PostingListReader { if let Some(metadata) = metadata.get() { return Ok(metadata.lengths[token_id as usize] as usize); } - let token_id = token_id as usize; - let batch = self - .reader - .read_range(token_id..token_id + 1, Some(&[LENGTH_COL])) - .await?; - let len = batch[LENGTH_COL].as_primitive::().value(0); - Ok(len as usize) + let (_, length) = self.posting_metadata_for_token(token_id).await?; + length + .map(|len| len as usize) + .ok_or_else(|| Error::index("posting length metadata missing".to_string())) } } } @@ -2416,17 +2421,20 @@ impl PostingListReader { Some(loaded.lengths[token_id as usize]), )); } - let token_id_usize = token_id as usize; - let batch = self - .reader - .read_range( - token_id_usize..token_id_usize + 1, - Some(&[MAX_SCORE_COL, LENGTH_COL]), - ) + let metadata = self + .index_cache + .get_or_insert_with_key(PostingMetadataKey { token_id }, || async move { + let token_id = token_id as usize; + 
let batch = self + .reader + .read_range(token_id..token_id + 1, Some(&[MAX_SCORE_COL, LENGTH_COL])) + .await?; + let max_score = batch[MAX_SCORE_COL].as_primitive::().value(0); + let length = batch[LENGTH_COL].as_primitive::().value(0); + Ok(PostingMetadataValue { max_score, length }) + }) .await?; - let max_score = batch[MAX_SCORE_COL].as_primitive::().value(0); - let length = batch[LENGTH_COL].as_primitive::().value(0); - Ok((Some(max_score), Some(length))) + Ok((Some(metadata.max_score), Some(metadata.length))) } } } @@ -3241,6 +3249,29 @@ impl CacheKey for PostingListGroupKey { } } +#[derive(Debug, Clone, DeepSizeOf)] +struct PostingMetadataValue { + max_score: f32, + length: u32, +} + +#[derive(Debug, Clone)] +struct PostingMetadataKey { + token_id: u32, +} + +impl CacheKey for PostingMetadataKey { + type ValueType = PostingMetadataValue; + + fn key(&self) -> std::borrow::Cow<'_, str> { + format!("posting-metadata-{}", self.token_id).into() + } + + fn type_name() -> &'static str { + "PostingMetadata" + } +} + #[derive(Debug, Clone)] pub struct PositionKey { pub token_id: u32, @@ -6845,6 +6876,7 @@ mod tests { // when the test exercises a scoring path. async fn load_counted_v2_index( num_tokens: usize, + cache: LanceCache, ) -> (Arc, Arc, TempObjDir) { let tmpdir = TempObjDir::default(); let inner_store = Arc::new(LanceIndexStore::new( @@ -6889,7 +6921,7 @@ mod tests { posting_file: posting_file_path(0), counter: counter.clone(), }); - let index = InvertedIndex::load(counting_store, None, &LanceCache::no_cache()) + let index = InvertedIndex::load(counting_store, None, &cache) .await .unwrap(); (index, counter, tmpdir) @@ -6902,9 +6934,9 @@ mod tests { /// /// * `InvertedIndex::load` does not touch the posting file at all /// (`InvertedPartition::load` only needs the token file and docs file). 
- /// * `bm25_stats_for_terms(["t0"])` reads exactly one row from the - /// posting file (the single LENGTH_COL entry for token 0) regardless - /// of how many unique tokens the partition has. + /// * `bm25_stats_for_terms(["t0"])` reads exactly one metadata row from + /// the posting file for token 0 regardless of how many unique tokens the + /// partition has. /// /// Before this refactor, `PostingListReader::try_new` did /// `read_range(0..num_rows, [MAX_SCORE_COL, LENGTH_COL])`, so the @@ -6917,7 +6949,8 @@ mod tests { #[case::tokens_1000(1000)] #[tokio::test] async fn test_bm25_stats_for_terms_is_lazy(#[case] num_tokens: usize) { - let (index, counter, _tmpdir) = load_counted_v2_index(num_tokens).await; + let (index, counter, _tmpdir) = + load_counted_v2_index(num_tokens, LanceCache::no_cache()).await; assert!( !index.partitions[0].inverted_list.is_legacy_layout(), "this test only proves the lazy path for v2 indexes", @@ -6957,6 +6990,38 @@ mod tests { ); } + #[tokio::test] + async fn test_bm25_stats_for_terms_reuses_posting_metadata_cache() { + let cache = LanceCache::with_capacity(1024 * 1024); + let (index, counter, _tmpdir) = load_counted_v2_index(100, cache.clone()).await; + + let terms = ["t0".to_string()]; + let first = index.bm25_stats_for_terms(&terms).await.unwrap(); + assert_eq!(first, (100, 100, vec![1])); + assert_eq!(counter.metadata_rows_read(), 1); + + let second = index.bm25_stats_for_terms(&terms).await.unwrap(); + assert_eq!(second, first); + assert_eq!( + counter.metadata_rows_read(), + 1, + "repeated stats for the same token should reuse cached posting metadata", + ); + } + + #[tokio::test] + async fn test_aggregate_corpus_stats_reuses_cached_value() { + let (index, _counter, _tmpdir) = load_counted_v2_index(100, LanceCache::no_cache()).await; + assert!(index.corpus_stats.get().is_none()); + + let first = index.aggregate_corpus_stats().await.unwrap(); + assert_eq!(first, (100, 100)); + assert_eq!(index.corpus_stats.get().copied(), 
Some(first)); + + let second = index.aggregate_corpus_stats().await.unwrap(); + assert_eq!(second, first); + } + #[tokio::test] async fn test_grouped_posting_lists_read_one_group_per_neighborhood() { // Cold-start scoring must not bulk-read the full `0..num_tokens` @@ -6966,7 +7031,8 @@ mod tests { // total token count. let num_tokens = 500; let queried_tokens: [u32; 4] = [0, 1, 2, 3]; - let (index, counter, _tmpdir) = load_counted_v2_index(num_tokens).await; + let (index, counter, _tmpdir) = + load_counted_v2_index(num_tokens, LanceCache::no_cache()).await; let inverted_list = index.partitions[0].inverted_list.clone(); assert!( !inverted_list.is_legacy_layout(), From 23211989de648fefc4454f5eee09ec176f0a465b Mon Sep 17 00:00:00 2001 From: Lance Release Bot Date: Tue, 23 Jun 2026 03:33:06 +0000 Subject: [PATCH 169/177] chore: release beta version 9.0.0-beta.2 --- .bumpversion.toml | 2 +- Cargo.lock | 48 +++++++++++++++++++-------------------- Cargo.toml | 44 +++++++++++++++++------------------ java/lance-jni/Cargo.lock | 40 ++++++++++++++++---------------- java/lance-jni/Cargo.toml | 2 +- java/pom.xml | 2 +- python/Cargo.lock | 40 ++++++++++++++++---------------- python/Cargo.toml | 2 +- 8 files changed, 90 insertions(+), 90 deletions(-) diff --git a/.bumpversion.toml b/.bumpversion.toml index 7c9e5196b3f..012ea06e89f 100644 --- a/.bumpversion.toml +++ b/.bumpversion.toml @@ -1,5 +1,5 @@ [tool.bumpversion] -current_version = "9.0.0-beta.1" +current_version = "9.0.0-beta.2" parse = "(?P\\d+)\\.(?P\\d+)\\.(?P\\d+)(-(?P(beta|rc))\\.(?P\\d+))?" 
serialize = [ "{major}.{minor}.{patch}-{prerelease}.{prerelease_num}", diff --git a/Cargo.lock b/Cargo.lock index 325c8dca20c..60899b5c69c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3076,7 +3076,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4380,7 +4380,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "all_asserts", "approx", @@ -4483,7 +4483,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", @@ -4531,7 +4531,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrayref", "paste", @@ -4540,7 +4540,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", @@ -4580,7 +4580,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -4613,7 +4613,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -4632,7 +4632,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "proc-macro2", "quote", @@ -4641,7 +4641,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-arith", "arrow-array", @@ -4686,7 +4686,7 @@ dependencies = [ [[package]] name = "lance-examples" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "all_asserts", "arrow", @@ -4712,7 +4712,7 
@@ dependencies = [ [[package]] name = "lance-file" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-arith", "arrow-array", @@ -4751,7 +4751,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "datafusion", "geo-traits", @@ -4765,7 +4765,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "approx", "arc-swap", @@ -4842,7 +4842,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-arith", @@ -4890,7 +4890,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "approx", "arrow-array", @@ -4910,7 +4910,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "async-trait", @@ -4922,7 +4922,7 @@ dependencies = [ [[package]] name = "lance-namespace-datafusion" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "arrow-schema", @@ -4938,7 +4938,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -5002,7 +5002,7 @@ dependencies = [ [[package]] name = "lance-select" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", @@ -5020,7 +5020,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -5066,7 +5066,7 @@ dependencies = [ [[package]] name = "lance-test-macros" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "proc-macro2", "quote", @@ -5075,7 +5075,7 @@ dependencies = [ [[package]] name = "lance-testing" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ 
"arrow-array", "arrow-schema", @@ -5088,7 +5088,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "icu_segmenter", "jieba-rs", @@ -5101,7 +5101,7 @@ dependencies = [ [[package]] name = "lance-tools" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "clap", "lance-core", diff --git a/Cargo.toml b/Cargo.toml index 508c950fc6e..a457d35ea7d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ resolver = "3" [workspace.package] -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" @@ -57,27 +57,27 @@ rust-version = "1.91.0" [workspace.dependencies] arc-swap = "1.7" libc = "0.2.176" -lance = { version = "=9.0.0-beta.1", path = "./rust/lance", default-features = false } -lance-arrow = { version = "=9.0.0-beta.1", path = "./rust/lance-arrow" } -lance-core = { version = "=9.0.0-beta.1", path = "./rust/lance-core" } -lance-datafusion = { version = "=9.0.0-beta.1", path = "./rust/lance-datafusion" } -lance-datagen = { version = "=9.0.0-beta.1", path = "./rust/lance-datagen" } -lance-derive = { version = "=9.0.0-beta.1", path = "./rust/lance-derive" } -lance-encoding = { version = "=9.0.0-beta.1", path = "./rust/lance-encoding" } -lance-file = { version = "=9.0.0-beta.1", path = "./rust/lance-file" } -lance-geo = { version = "=9.0.0-beta.1", path = "./rust/lance-geo" } -lance-index = { version = "=9.0.0-beta.1", path = "./rust/lance-index" } -lance-io = { version = "=9.0.0-beta.1", path = "./rust/lance-io", default-features = false } -lance-linalg = { version = "=9.0.0-beta.1", path = "./rust/lance-linalg" } -lance-namespace = { version = "=9.0.0-beta.1", path = "./rust/lance-namespace" } -lance-namespace-impls = { version = "=9.0.0-beta.1", path = "./rust/lance-namespace-impls" } +lance = { version = "=9.0.0-beta.2", path = "./rust/lance", default-features = false } +lance-arrow = { version = "=9.0.0-beta.2", 
path = "./rust/lance-arrow" } +lance-core = { version = "=9.0.0-beta.2", path = "./rust/lance-core" } +lance-datafusion = { version = "=9.0.0-beta.2", path = "./rust/lance-datafusion" } +lance-datagen = { version = "=9.0.0-beta.2", path = "./rust/lance-datagen" } +lance-derive = { version = "=9.0.0-beta.2", path = "./rust/lance-derive" } +lance-encoding = { version = "=9.0.0-beta.2", path = "./rust/lance-encoding" } +lance-file = { version = "=9.0.0-beta.2", path = "./rust/lance-file" } +lance-geo = { version = "=9.0.0-beta.2", path = "./rust/lance-geo" } +lance-index = { version = "=9.0.0-beta.2", path = "./rust/lance-index" } +lance-io = { version = "=9.0.0-beta.2", path = "./rust/lance-io", default-features = false } +lance-linalg = { version = "=9.0.0-beta.2", path = "./rust/lance-linalg" } +lance-namespace = { version = "=9.0.0-beta.2", path = "./rust/lance-namespace" } +lance-namespace-impls = { version = "=9.0.0-beta.2", path = "./rust/lance-namespace-impls" } lance-namespace-datafusion = { version = "=7.0.0-beta.9", path = "./rust/lance-namespace-datafusion" } lance-namespace-reqwest-client = "0.8.6" -lance-select = { version = "=9.0.0-beta.1", path = "./rust/lance-select" } -lance-tokenizer = { version = "=9.0.0-beta.1", path = "./rust/lance-tokenizer" } -lance-table = { version = "=9.0.0-beta.1", path = "./rust/lance-table" } -lance-test-macros = { version = "=9.0.0-beta.1", path = "./rust/lance-test-macros" } -lance-testing = { version = "=9.0.0-beta.1", path = "./rust/lance-testing" } +lance-select = { version = "=9.0.0-beta.2", path = "./rust/lance-select" } +lance-tokenizer = { version = "=9.0.0-beta.2", path = "./rust/lance-tokenizer" } +lance-table = { version = "=9.0.0-beta.2", path = "./rust/lance-table" } +lance-test-macros = { version = "=9.0.0-beta.2", path = "./rust/lance-test-macros" } +lance-testing = { version = "=9.0.0-beta.2", path = "./rust/lance-testing" } approx = "0.5.1" # Note that this one does not include pyarrow arrow = { version 
= "58.0.0", optional = false, features = ["prettyprint"] } @@ -104,7 +104,7 @@ half = { "version" = "2.1", default-features = false, features = [ "num-traits", "std", ] } -lance-bitpacking = { version = "=9.0.0-beta.1", path = "./rust/compression/bitpacking" } +lance-bitpacking = { version = "=9.0.0-beta.2", path = "./rust/compression/bitpacking" } bitpacking = "0.9" bitvec = "1" bytes = "1.11.1" @@ -143,7 +143,7 @@ datafusion-substrait = { version = "53.0.0", default-features = false } dirs = "6.0.0" either = "1.0" fst = { version = "0.4.7", features = ["levenshtein"] } -fsst = { version = "=9.0.0-beta.1", path = "./rust/compression/fsst" } +fsst = { version = "=9.0.0-beta.2", path = "./rust/compression/fsst" } futures = "0.3" geoarrow-array = "0.8" geoarrow-schema = "0.8" diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index 2827ef9b19a..d8131377563 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -2479,7 +2479,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "rand 0.9.4", @@ -3665,7 +3665,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arc-swap", "arrow", @@ -3738,7 +3738,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", @@ -3780,7 +3780,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrayref", "paste", @@ -3789,7 +3789,7 @@ dependencies = [ [[package]] name = "lance-core" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", @@ -3827,7 +3827,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = 
"9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -3859,7 +3859,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -3876,7 +3876,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "proc-macro2", "quote", @@ -3885,7 +3885,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-arith", "arrow-array", @@ -3920,7 +3920,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-arith", "arrow-array", @@ -3950,7 +3950,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "datafusion", "geo-traits", @@ -3964,7 +3964,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arc-swap", "arrow", @@ -4032,7 +4032,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-arith", @@ -4073,7 +4073,7 @@ dependencies = [ [[package]] name = "lance-jni" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -4109,7 +4109,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", @@ -4125,7 +4125,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "async-trait", @@ -4137,7 +4137,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-ipc", @@ -4186,7 +4186,7 @@ dependencies = [ [[package]] name = 
"lance-select" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", @@ -4201,7 +4201,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -4238,7 +4238,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "icu_segmenter", "rust-stemmers", diff --git a/java/lance-jni/Cargo.toml b/java/lance-jni/Cargo.toml index db37d1d31a7..3626d7aad3e 100644 --- a/java/lance-jni/Cargo.toml +++ b/java/lance-jni/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lance-jni" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" edition = "2024" authors = ["Lance Devs "] rust-version = "1.91" diff --git a/java/pom.xml b/java/pom.xml index 9b4e57e4724..15d05b95d68 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -7,7 +7,7 @@ org.lance lance-core Lance Core - 9.0.0-beta.1 + 9.0.0-beta.2 jar Lance Format Java API diff --git a/python/Cargo.lock b/python/Cargo.lock index b1b207e83d7..955f35f97eb 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -2859,7 +2859,7 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "fsst" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "rand 0.9.4", @@ -4067,7 +4067,7 @@ checksum = "e037a2e1d8d5fdbd49b16a4ea09d5d6401c1f29eca5ff29d03d3824dba16256a" [[package]] name = "lance" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arc-swap", "arrow", @@ -4141,7 +4141,7 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", @@ -4183,7 +4183,7 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrayref", "paste", @@ -4192,7 +4192,7 @@ dependencies = [ [[package]] name = "lance-core" 
-version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", @@ -4230,7 +4230,7 @@ dependencies = [ [[package]] name = "lance-datafusion" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -4262,7 +4262,7 @@ dependencies = [ [[package]] name = "lance-datagen" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -4279,7 +4279,7 @@ dependencies = [ [[package]] name = "lance-derive" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "proc-macro2", "quote", @@ -4288,7 +4288,7 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-arith", "arrow-array", @@ -4323,7 +4323,7 @@ dependencies = [ [[package]] name = "lance-file" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-arith", "arrow-array", @@ -4353,7 +4353,7 @@ dependencies = [ [[package]] name = "lance-geo" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "datafusion", "geo-traits", @@ -4367,7 +4367,7 @@ dependencies = [ [[package]] name = "lance-index" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arc-swap", "arrow", @@ -4436,7 +4436,7 @@ dependencies = [ [[package]] name = "lance-io" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-arith", @@ -4477,7 +4477,7 @@ dependencies = [ [[package]] name = "lance-linalg" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", @@ -4493,7 +4493,7 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "async-trait", @@ -4505,7 +4505,7 @@ dependencies = [ [[package]] name = "lance-namespace-impls" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-ipc", @@ -4554,7 +4554,7 @@ dependencies = [ 
[[package]] name = "lance-select" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow-array", "arrow-buffer", @@ -4569,7 +4569,7 @@ dependencies = [ [[package]] name = "lance-table" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -4608,7 +4608,7 @@ dependencies = [ [[package]] name = "lance-tokenizer" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "icu_segmenter", "jieba-rs", @@ -6046,7 +6046,7 @@ dependencies = [ [[package]] name = "pylance" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" dependencies = [ "alloc-stdlib", "arrow", diff --git a/python/Cargo.toml b/python/Cargo.toml index 89a50a652dc..e76137fc63c 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pylance" -version = "9.0.0-beta.1" +version = "9.0.0-beta.2" edition = "2024" authors = ["Lance Devs "] license = "Apache-2.0" From 7ebc22da12495d67ff6125e219dcd6510d84d170 Mon Sep 17 00:00:00 2001 From: Yang Cen Date: Tue, 23 Jun 2026 13:46:36 +0800 Subject: [PATCH 170/177] perf(fts): open fts segments as scalar indices (#7408) --- rust/lance/src/io/exec/fts.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/lance/src/io/exec/fts.rs b/rust/lance/src/io/exec/fts.rs index c087d3289c0..add864c0ea9 100644 --- a/rust/lance/src/io/exec/fts.rs +++ b/rust/lance/src/io/exec/fts.rs @@ -65,7 +65,7 @@ async fn open_fts_segment( metrics: &IndexMetrics, ) -> Result> { let index = dataset - .open_generic_index(column, &segment.uuid, metrics) + .open_scalar_index(column, &segment.uuid, metrics) .await?; let inverted = index .as_any() From ed9caf34fa6ebe19aaefacb0c3db9ad3bab435b1 Mon Sep 17 00:00:00 2001 From: Dan Rammer Date: Tue, 23 Jun 2026 01:01:59 -0500 Subject: [PATCH 171/177] feat(mem-wal): add ShardWriter::put_no_wait (#7362) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What Adds `ShardWriter::put_no_wait`, a 
variant of `put` that performs the visible in-memory insert and triggers the durable WAL flush, then returns the `BatchDurableWatcher` **without** awaiting it. A thin wrapper restores `put`'s behavior (await the watcher), so existing callers are unchanged. `put_memtable` is split into: - `put_memtable_no_wait` — the in-memory critical section (insert under `state_lock` + `track_batch_for_wal` + flush triggers) followed by `trigger_flush`, returning `(WriteResult, Option<BatchDurableWatcher>)`. - `put_memtable` — calls the above, then `watcher.wait()`. `BatchDurableWatcher` and `WriteResult` are re-exported from `mem_wal`. ## Why Lets an external caller hold its own serialization lock across only the in-memory read-merge-insert critical section and await durability **after** releasing it, so concurrent durable flushes still coalesce. The in-memory insert stays guarded by the writer's `state_lock`, so `BatchStore`'s single-writer invariant holds regardless of the external lock — `state_lock` is intentionally **not** skipped. This is the lance-side primitive for sophon's WAL partial-column-update path (read fresh tier → merge → insert under a per-bucket lock, durability awaited outside it). ## Tests `test_put_no_wait_durable_visible_then_durable` (row visible before durability, watcher resolves) and `test_put_no_wait_non_durable_returns_no_watcher`. 
--------- Co-authored-by: Lance Release Bot Co-authored-by: Claude Opus 4.8 (1M context) --- rust/lance/src/dataset/mem_wal.rs | 3 +- rust/lance/src/dataset/mem_wal/write.rs | 161 +++++++++++++++++++++--- 2 files changed, 148 insertions(+), 16 deletions(-) diff --git a/rust/lance/src/dataset/mem_wal.rs b/rust/lance/src/dataset/mem_wal.rs index 5f3bc2ed483..7eaa8ffb83f 100644 --- a/rust/lance/src/dataset/mem_wal.rs +++ b/rust/lance/src/dataset/mem_wal.rs @@ -51,6 +51,7 @@ pub use sharding::{ evaluate_sharding_spec, evaluate_sharding_spec_with_embedded_columns, evaluate_sharding_spec_with_source_columns, }; -pub use wal::{WalAppendResult, WalAppender, WalReadEntry, WalTailer}; +pub use wal::{BatchDurableWatcher, WalAppendResult, WalAppender, WalReadEntry, WalTailer}; pub use write::ShardWriter; pub use write::ShardWriterConfig; +pub use write::WriteResult; diff --git a/rust/lance/src/dataset/mem_wal/write.rs b/rust/lance/src/dataset/mem_wal/write.rs index 491bb68aec5..5faa65d8e7d 100644 --- a/rust/lance/src/dataset/mem_wal/write.rs +++ b/rust/lance/src/dataset/mem_wal/write.rs @@ -49,7 +49,8 @@ pub use super::wal::{WalEntry, WalEntryData, WalFlushResult, WalFlusher}; use super::memtable::flush::TriggerMemTableFlush; use super::scanner::GenerationWarmer; use super::wal::{ - TriggerWalFlush, WalAppender, WalFlushSource, WalOnlyState, WalTailer, empty_flush_result, + BatchDurableWatcher, TriggerWalFlush, WalAppender, WalFlushSource, WalOnlyState, WalTailer, + empty_flush_result, }; use super::manifest::ShardManifestStore; @@ -1528,14 +1529,7 @@ impl ShardWriter { /// `AlreadyExists`, indicating this writer has been fenced. 
#[instrument(name = "sw_put", level = "info", skip_all, fields(batch_count = batches.len(), shard_id = %self.config.shard_id))] pub async fn put(&self, batches: Vec) -> Result { - if batches.is_empty() { - return Err(Error::invalid_input("Cannot write empty batch list")); - } - for (i, batch) in batches.iter().enumerate() { - if batch.num_rows() == 0 { - return Err(Error::invalid_input(format!("Batch {} is empty", i))); - } - } + Self::validate_non_empty(&batches)?; match &self.mode { WriterMode::MemTable { @@ -1558,6 +1552,51 @@ impl ShardWriter { } } + /// Like [`Self::put`], but returns the durability watcher *without* awaiting + /// it. The row is visible to reads on this writer the instant this returns; + /// the caller awaits durability via the watcher (`None` when `durable_write` + /// is off). + /// + /// This lets a caller hold an *external* lock across only the in-memory + /// read-merge-insert and await durability after releasing it, so concurrent + /// flushes still coalesce. The insert stays guarded by the internal + /// `state_lock`, so `BatchStore`'s single-writer invariant holds regardless. + /// + /// MemTable mode only; errors in WAL-only mode (no in-memory tier). + #[instrument(name = "sw_put_no_wait", level = "info", skip_all, fields(batch_count = batches.len(), shard_id = %self.config.shard_id))] + pub async fn put_no_wait( + &self, + batches: Vec, + ) -> Result<(WriteResult, Option)> { + Self::validate_non_empty(&batches)?; + + match &self.mode { + WriterMode::MemTable { + state, + writer_state, + backpressure, + } => { + self.put_memtable_no_wait(batches, state, writer_state, backpressure) + .await + } + WriterMode::WalOnly { .. 
} => Err(Error::invalid_input( + "put_no_wait is only supported in MemTable mode", + )), + } + } + + fn validate_non_empty(batches: &[RecordBatch]) -> Result<()> { + if batches.is_empty() { + return Err(Error::invalid_input("Cannot write empty batch list")); + } + for (i, batch) in batches.iter().enumerate() { + if batch.num_rows() == 0 { + return Err(Error::invalid_input(format!("Batch {} is empty", i))); + } + } + Ok(()) + } + async fn put_memtable( &self, batches: Vec, @@ -1565,6 +1604,26 @@ impl ShardWriter { writer_state: &Arc, backpressure: &BackpressureController, ) -> Result { + let (result, watcher) = self + .put_memtable_no_wait(batches, state_lock, writer_state, backpressure) + .await?; + // Wait for durability if configured (outside the lock). + if let Some(mut watcher) = watcher { + watcher.wait().await?; + } + Ok(result) + } + + /// In-memory half of [`Self::put_memtable`]: insert under `state_lock`, + /// trigger the WAL flush, and return the watcher un-awaited for the caller + /// to wait on. `None` when `durable_write` is off. See [`Self::put_no_wait`]. + async fn put_memtable_no_wait( + &self, + batches: Vec, + state_lock: &Arc>, + writer_state: &Arc, + backpressure: &BackpressureController, + ) -> Result<(WriteResult, Option)> { // Apply backpressure if needed (before acquiring main lock) backpressure .maybe_apply_backpressure(|| { @@ -1578,7 +1637,7 @@ impl ShardWriter { let start = std::time::Instant::now(); // Acquire write lock for entire operation (atomic approach) - let (batch_positions, mut durable_watcher, batch_store, indexes) = { + let (batch_positions, durable_watcher, batch_store, indexes) = { let mut state = state_lock.write().await; // 1. 
Insert all batches into memtable atomically @@ -1609,8 +1668,9 @@ impl ShardWriter { self.stats.record_put(start.elapsed()); - // Wait for durability if configured (outside the lock) - if self.config.durable_write { + // Trigger the flush here (outside the lock) so the watcher can resolve; + // only the `wait()` is the caller's to schedule. + let watcher = if self.config.durable_write { self.wal_flusher.trigger_flush( WalFlushSource::BatchStore { batch_store, @@ -1619,10 +1679,12 @@ impl ShardWriter { batch_positions.end, None, )?; - durable_watcher.wait().await?; - } + Some(durable_watcher) + } else { + None + }; - Ok(WriteResult { batch_positions }) + Ok((WriteResult { batch_positions }, watcher)) } async fn put_wal_only( @@ -2814,6 +2876,75 @@ mod tests { writer.close().await.unwrap(); } + #[tokio::test] + async fn test_put_no_wait_durable_visible_then_durable() { + let (store, base_path, base_uri, _temp_dir) = create_local_store().await; + let schema = create_test_schema(); + + let config = ShardWriterConfig { + shard_id: Uuid::new_v4(), + shard_spec_id: 0, + durable_write: true, + sync_indexed_write: false, + max_wal_buffer_size: 1024 * 1024, + max_wal_flush_interval: None, + max_memtable_size: 64 * 1024 * 1024, + manifest_scan_batch_size: 2, + ..Default::default() + }; + + let writer = ShardWriter::open(store, base_path, base_uri, config, schema.clone(), vec![]) + .await + .unwrap(); + + let batch = create_test_batch(&schema, 0, 10); + let (result, watcher) = writer.put_no_wait(vec![batch]).await.unwrap(); + assert_eq!(result.batch_positions, 0..1); + + // Row is visible in memory before durability is awaited. + let stats = writer.memtable_stats().await.unwrap(); + assert_eq!(stats.row_count, 10); + + // durable_write is on, so a watcher is returned and resolves once the + // triggered flush lands. 
+ let mut watcher = watcher.expect("durable_write returns a watcher"); + watcher.wait().await.unwrap(); + + writer.close().await.unwrap(); + } + + #[tokio::test] + async fn test_put_no_wait_non_durable_returns_no_watcher() { + let (store, base_path, base_uri, _temp_dir) = create_local_store().await; + let schema = create_test_schema(); + + let config = ShardWriterConfig { + shard_id: Uuid::new_v4(), + shard_spec_id: 0, + durable_write: false, + sync_indexed_write: false, + max_wal_buffer_size: 1024 * 1024, + max_wal_flush_interval: None, + max_memtable_size: 64 * 1024 * 1024, + manifest_scan_batch_size: 2, + ..Default::default() + }; + + let writer = ShardWriter::open(store, base_path, base_uri, config, schema.clone(), vec![]) + .await + .unwrap(); + + let batch = create_test_batch(&schema, 0, 10); + let (result, watcher) = writer.put_no_wait(vec![batch]).await.unwrap(); + assert_eq!(result.batch_positions, 0..1); + assert!(watcher.is_none(), "non-durable put has nothing to await"); + + let stats = writer.memtable_stats().await.unwrap(); + assert_eq!(stats.row_count, 10); + + writer.close().await.unwrap(); + } + #[tokio::test] async fn test_shard_writer_multiple_writes() { let (store, base_path, base_uri, _temp_dir) = create_local_store().await; From df08f7f020f2e60ddb0bc1c2805b3f02dc3e6def Mon Sep 17 00:00:00 2001 From: YangJie Date: Tue, 23 Jun 2026 22:24:28 +0800 Subject: [PATCH 172/177] fix(io): clamp ObjectStore::io_parallelism() to at least 1 (#7414) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary `ObjectStore::io_parallelism()` returns the `LANCE_IO_THREADS` override verbatim when that env var is set, so `LANCE_IO_THREADS=0` yields `0`. Callers feed this value straight into `buffered` / `buffer_unordered`, and a window of **0** makes those streams **never poll their input** — so a plain metadata-only `count_rows` (and ~8 other fan-out sites in `dataset.rs`) would hang instead of returning. 
```diff pub fn io_parallelism(&self) -> usize { std::env::var("LANCE_IO_THREADS") .map(|val| val.parse::().unwrap()) .unwrap_or(self.io_parallelism) + .max(1) } ``` Clamping at the source covers every caller in one place. The store's configured default is already `>= 1`, so this only changes the explicit `LANCE_IO_THREADS=0` case. ## Context Follow-up to #7076, where the same hang was fixed locally in the count path (per review). This addresses the root cause so the other unguarded `io_parallelism()` → `buffered`/`buffer_unordered` sites are covered too. ## Test `test_io_parallelism_clamped_to_nonzero` asserts `LANCE_IO_THREADS=0` clamps to `1`, a positive override (`8`) passes through unchanged, and the default is `>= 1`. --- rust/lance-io/src/object_store.rs | 36 +++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/rust/lance-io/src/object_store.rs b/rust/lance-io/src/object_store.rs index 0c44095f117..1761dc4b059 100644 --- a/rust/lance-io/src/object_store.rs +++ b/rust/lance-io/src/object_store.rs @@ -578,10 +578,17 @@ impl ObjectStore { self.max_iop_size } + /// The amount of parallelism to use for I/O operations. + /// + /// Honors the `LANCE_IO_THREADS` override when set, otherwise the store's configured value. + /// Always at least 1: callers feed this straight into `buffered` / `buffer_unordered`, and a + /// window of 0 makes those streams never poll their input — e.g. a metadata-only `count_rows` + /// would hang rather than return. pub fn io_parallelism(&self) -> usize { std::env::var("LANCE_IO_THREADS") .map(|val| val.parse::().unwrap()) .unwrap_or(self.io_parallelism) + .max(1) } /// Get the IO tracker for this object store @@ -1158,6 +1165,35 @@ mod tests { Ok(contents) } + #[test] + fn test_io_parallelism_clamped_to_nonzero() { + // `io_parallelism()` feeds `buffered`/`buffer_unordered` windows; a value of 0 makes those + // streams never poll, hanging callers (e.g. a metadata-only `count_rows`). It must clamp. 
+ let store = ObjectStore::local(); + + // SAFETY: process-global env var, set and restored within this test. `io_parallelism()` + // only reads it, and a concurrent reader observes a valid clamped value, never 0. + unsafe { std::env::set_var("LANCE_IO_THREADS", "0") }; + assert_eq!( + store.io_parallelism(), + 1, + "LANCE_IO_THREADS=0 must clamp to 1" + ); + + unsafe { std::env::set_var("LANCE_IO_THREADS", "8") }; + assert_eq!( + store.io_parallelism(), + 8, + "a positive override must pass through unchanged" + ); + + unsafe { std::env::remove_var("LANCE_IO_THREADS") }; + assert!( + store.io_parallelism() >= 1, + "the configured default parallelism must be at least 1" + ); + } + #[tokio::test] async fn test_absolute_paths() { let tmp_path = TempStrDir::default(); From 5cfc088b0e7fa0e1d679677a2d7991265d72d5d9 Mon Sep 17 00:00:00 2001 From: Ragnor Comerford Date: Tue, 23 Jun 2026 17:21:09 +0200 Subject: [PATCH 173/177] feat: expose staged index segment transactions Add `Dataset::build_existing_index_segments_transaction`, which builds the `Operation::CreateIndex` transaction for existing physical index segments without committing it. Callers commit the returned transaction via `CommitBuilder` for a strict stage-then-commit workflow, mirroring `InsertBuilder::execute_uncommitted`. `commit_existing_index_segments` now delegates to it. The method is inherent on `Dataset` rather than a new required method on the public `DatasetIndexExt` trait, so it is non-breaking for downstream trait implementors. 
Fixes #6666 --- rust/lance/src/index.rs | 513 ++++++++++++++++++++++++++++++++-------- 1 file changed, 414 insertions(+), 99 deletions(-) diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 1a3a3aa54ec..45aa96a3fd0 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -878,6 +878,155 @@ impl IndexDescription for IndexDescriptionImpl { } } +impl Dataset { + /// Build, without committing, the transaction that publishes one or more + /// existing physical index segments as a logical index. + /// + /// This stages the same manifest update as + /// [`commit_existing_index_segments`](DatasetIndexExt::commit_existing_index_segments) + /// but does not advance the dataset version. Use + /// [`CommitBuilder`](crate::dataset::CommitBuilder) to commit the returned + /// [`Transaction`]. + /// + /// The transaction is a snapshot built against the current dataset version, + /// so commit it promptly. A concurrent index creation with the same name is + /// rejected at commit time with a retryable conflict, but other concurrent + /// changes to the same index between staging and commit — a compaction/rewrite + /// that remaps it, or dropping/renaming the indexed column — are not + /// conflict-checked and may leave a duplicate or stale index entry. + /// + /// # Side effects + /// + /// For most index types this only reads the segment directories. For inverted + /// (full-text) segments it also finalizes the segment's on-disk files within + /// the segment's UUID directory before returning. Finalization is idempotent, + /// and any files left behind if the returned transaction is never committed + /// are reclaimed by `cleanup_old_versions` like other unreferenced index + /// files. 
+ /// + /// # Example + /// + /// ``` + /// # use std::sync::Arc; + /// # use lance::Result; + /// # use lance::dataset::{CommitBuilder, Dataset}; + /// # use lance::index::IndexSegment; + /// # async fn example(dataset: Arc, segments: Vec) -> Result<()> { + /// let transaction = dataset + /// .build_existing_index_segments_transaction("vector_idx", "vector", segments) + /// .await?; + /// CommitBuilder::new(dataset).execute(transaction).await?; + /// # Ok(()) + /// # } + /// ``` + pub async fn build_existing_index_segments_transaction( + &self, + index_name: &str, + column: &str, + segments: Vec, + ) -> Result { + let Some(field) = self.schema().field(column) else { + return Err(Error::index(format!( + "CreateIndex: column '{column}' does not exist" + ))); + }; + + let segments = segments + .into_iter() + .map(IntoIndexSegment::into_index_segment) + .collect::>>()?; + let new_indices = + build_index_metadata_from_segments(self, index_name, field.id, segments).await?; + validate_segment_metadata(index_name, &new_indices)?; + validate_segment_index_details(index_name, &new_indices)?; + + let incoming_type_url = new_indices[0] + .index_details + .as_ref() + .map(|details| details.type_url.clone()); + let dataset_fragments = self.fragment_bitmap.as_ref().clone(); + let mut incoming_fragments = RoaringBitmap::new(); + for segment in &new_indices { + if segment.fields != [field.id] { + return Err(Error::invalid_input(format!( + "CreateIndex: segment {} was built for fields {:?}, expected [{}]", + segment.uuid, segment.fields, field.id + ))); + } + if let Some(fragment_bitmap) = &segment.fragment_bitmap { + incoming_fragments |= fragment_bitmap.clone(); + } + } + + let existing_named_indices = self.load_indices_by_name(index_name).await?; + if existing_named_indices + .iter() + .any(|idx| idx.fields != [field.id]) + { + return Err(Error::index(format!( + "Index name '{index_name}' already exists with different fields, \ + please specify a different name" + ))); + } + let 
removed_indices = existing_named_indices + .into_iter() + .filter(|idx| { + idx.index_details + .as_ref() + .zip(incoming_type_url.as_deref()) + .is_none_or(|(details, expected)| details.type_url == expected) + }) + .map(|idx| -> Result> { + let Some(existing_fragments) = idx.effective_fragment_bitmap(&dataset_fragments) + else { + if incoming_fragments != dataset_fragments { + return Err(Error::invalid_input(format!( + "CreateIndex: cannot replace legacy index segment {} for '{}' with partial fragment coverage; rebuild all fragments in one commit", + idx.uuid, index_name + ))); + } + return Ok(Some(idx)); + }; + + // A zero-fragment segment can be used to create an index while + // deferring the actual build. Such a segment is disjoint from every + // other segment but should still be removed. + if existing_fragments.is_empty() { + return Ok(Some(idx)); + } + + if existing_fragments.is_disjoint(&incoming_fragments) { + return Ok(None); + } + + let uncovered = existing_fragments - &incoming_fragments; + if !uncovered.is_empty() { + return Err(Error::invalid_input(format!( + "CreateIndex: incoming segments for '{}' would orphan fragments {:?} from existing segment {}", + index_name, + uncovered.iter().collect::>(), + idx.uuid + ))); + } + + Ok(Some(idx)) + }) + .collect::>>()? 
+ .into_iter() + .flatten() + .collect::>(); + + Ok(Transaction::new( + self.manifest.version, + Operation::CreateIndex { + new_indices, + removed_indices, + }, + None, + )) + } +} + #[async_trait] impl DatasetIndexExt for Dataset { type IndexBuilder<'a> = CreateIndexBuilder<'a>; @@ -1186,105 +1335,9 @@ impl DatasetIndexExt for Dataset { column: &str, segments: Vec, ) -> Result<()> { - let Some(field) = self.schema().field(column) else { - return Err(Error::index(format!( - "CreateIndex: column '{column}' does not exist" - ))); - }; - - let segments = segments - .into_iter() - .map(IntoIndexSegment::into_index_segment) - .collect::>>()?; - let new_indices = - build_index_metadata_from_segments(self, index_name, field.id, segments).await?; - validate_segment_metadata(index_name, &new_indices)?; - validate_segment_index_details(index_name, &new_indices)?; - - let incoming_type_url = new_indices[0] - .index_details - .as_ref() - .map(|details| details.type_url.clone()); - let dataset_fragments = self.fragment_bitmap.as_ref().clone(); - let mut incoming_fragments = RoaringBitmap::new(); - for segment in &new_indices { - if segment.fields != [field.id] { - return Err(Error::invalid_input(format!( - "CreateIndex: segment {} was built for fields {:?}, expected [{}]", - segment.uuid, segment.fields, field.id - ))); - } - if let Some(fragment_bitmap) = &segment.fragment_bitmap { - incoming_fragments |= fragment_bitmap.clone(); - } - } - - let existing_named_indices = self.load_indices_by_name(index_name).await?; - if existing_named_indices - .iter() - .any(|idx| idx.fields != [field.id]) - { - return Err(Error::index(format!( - "Index name '{index_name}' already exists with different fields, \ - please specify a different name" - ))); - } - let removed_indices = existing_named_indices - .into_iter() - .filter(|idx| { - idx.index_details - .as_ref() - .zip(incoming_type_url.as_deref()) - .is_none_or(|(details, expected)| details.type_url == expected) - }) - .map(|idx| -> 
Result> { - let Some(existing_fragments) = idx.effective_fragment_bitmap(&dataset_fragments) - else { - if incoming_fragments != dataset_fragments { - return Err(Error::invalid_input(format!( - "CreateIndex: cannot replace legacy index segment {} for '{}' with partial fragment coverage; rebuild all fragments in one commit", - idx.uuid, index_name - ))); - } - return Ok(Some(idx)); - }; - - // A zero-fragment segment can be used to create an index while - // deferring the actual build. Such a segment is disjoint from every - // other segment but should still be removed. - if existing_fragments.is_empty() { - return Ok(Some(idx)); - } - - if existing_fragments.is_disjoint(&incoming_fragments) { - return Ok(None); - } - - let uncovered = existing_fragments - &incoming_fragments; - if !uncovered.is_empty() { - return Err(Error::invalid_input(format!( - "CreateIndex: incoming segments for '{}' would orphan fragments {:?} from existing segment {}", - index_name, - uncovered.iter().collect::>(), - idx.uuid - ))); - } - - Ok(Some(idx)) - }) - .collect::>>()? 
- .into_iter() - .flatten() - .collect::>(); - - let transaction = Transaction::new( - self.manifest.version, - Operation::CreateIndex { - new_indices, - removed_indices, - }, - None, - ); + let transaction = self + .build_existing_index_segments_transaction(index_name, column, segments) + .await?; self.apply_commit(transaction, &Default::default(), &Default::default()) .await?; @@ -6947,6 +7000,268 @@ mod tests { ); } + #[tokio::test] + async fn test_build_existing_index_segments_transaction_does_not_commit() { + use crate::dataset::CommitBuilder; + use lance_datagen::{BatchCount, RowCount, array}; + + let test_dir = tempfile::tempdir().unwrap(); + let test_uri = test_dir.path().to_str().unwrap(); + + let reader = lance_datagen::gen_batch() + .col("id", array::step::()) + .col( + "vector", + array::rand_vec::(8.into()), + ) + .into_reader_rows(RowCount::from(10), BatchCount::from(2)); + + let dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + max_rows_per_file: 10, + max_rows_per_group: 10, + ..Default::default() + }), + ) + .await + .unwrap(); + // 20 rows with max_rows_per_file=10 yields two single-fragment files. + assert_eq!(dataset.get_fragments().len(), 2); + + let read_version = dataset.manifest.version; + let field_id = dataset.schema().field("vector").unwrap().id; + let seg0 = write_vector_segment_metadata( + &dataset, + "vector_idx", + field_id, + Uuid::new_v4(), + [0_u32], + b"seg0", + ) + .await; + let seg1 = write_vector_segment_metadata( + &dataset, + "vector_idx", + field_id, + Uuid::new_v4(), + [1_u32], + b"seg1", + ) + .await; + + let transaction = dataset + .build_existing_index_segments_transaction( + "vector_idx", + "vector", + vec![segment_from_metadata(&seg0), segment_from_metadata(&seg1)], + ) + .await + .unwrap(); + + // Building the transaction must not publish the index. 
+ assert!( + dataset + .load_indices_by_name("vector_idx") + .await + .unwrap() + .is_empty(), + "building a transaction must not publish the index" + ); + assert_eq!(transaction.read_version, read_version); + let Operation::CreateIndex { + new_indices, + removed_indices, + } = &transaction.operation + else { + panic!("expected index creation transaction"); + }; + assert_eq!( + new_indices.iter().map(|i| i.uuid).collect::>(), + HashSet::from([seg0.uuid, seg1.uuid]), + ); + assert!(removed_indices.is_empty()); + + // The returned transaction can be committed via CommitBuilder. + let committed = CommitBuilder::new(Arc::new(dataset)) + .execute(transaction) + .await + .unwrap(); + let indices = committed.load_indices_by_name("vector_idx").await.unwrap(); + assert_eq!( + indices.iter().map(|i| i.uuid).collect::>(), + HashSet::from([seg0.uuid, seg1.uuid]), + ); + } + + #[tokio::test] + async fn test_build_existing_index_segments_transaction_removes_empty_segment() { + use lance_datagen::{BatchCount, RowCount, array}; + + let test_dir = tempfile::tempdir().unwrap(); + let reader = lance_datagen::gen_batch() + .col("id", array::step::()) + .col( + "vector", + array::rand_vec::(8.into()), + ) + .into_reader_rows(RowCount::from(10), BatchCount::from(1)); + let mut dataset = Dataset::write(reader, test_dir.path().to_str().unwrap(), None) + .await + .unwrap(); + let field_id = dataset.schema().field("vector").unwrap().id; + + // Commit a 0-fragment placeholder segment. + let empty = write_vector_segment_metadata( + &dataset, + "vector_idx", + field_id, + Uuid::new_v4(), + std::iter::empty::(), + b"empty", + ) + .await; + dataset + .commit_existing_index_segments( + "vector_idx", + "vector", + vec![segment_from_metadata(&empty)], + ) + .await + .unwrap(); + + // Staging a real segment that covers the dataset must mark the placeholder + // for removal at build time, exercising the zero-fragment guard on the + // staged path (not just the committed path). 
+ let seg = write_vector_segment_metadata( + &dataset, + "vector_idx", + field_id, + Uuid::new_v4(), + [0_u32], + b"seg", + ) + .await; + let transaction = dataset + .build_existing_index_segments_transaction( + "vector_idx", + "vector", + vec![segment_from_metadata(&seg)], + ) + .await + .unwrap(); + let Operation::CreateIndex { + new_indices, + removed_indices, + } = &transaction.operation + else { + panic!("expected index creation transaction"); + }; + assert_eq!( + new_indices.iter().map(|i| i.uuid).collect::>(), + HashSet::from([seg.uuid]), + ); + assert_eq!( + removed_indices + .iter() + .map(|i| i.uuid) + .collect::>(), + HashSet::from([empty.uuid]), + "the zero-fragment placeholder must be staged for removal", + ); + } + + #[tokio::test] + async fn test_build_existing_index_segments_transaction_commits_after_version_advances() { + use crate::dataset::CommitBuilder; + use lance_datagen::{BatchCount, RowCount, array}; + + let test_dir = tempfile::tempdir().unwrap(); + let test_uri = test_dir.path().to_str().unwrap(); + + let reader = lance_datagen::gen_batch() + .col("id", array::step::()) + .col( + "vector", + array::rand_vec::(8.into()), + ) + .into_reader_rows(RowCount::from(10), BatchCount::from(2)); + let dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + max_rows_per_file: 10, + max_rows_per_group: 10, + ..Default::default() + }), + ) + .await + .unwrap(); + let read_version = dataset.manifest.version; + let field_id = dataset.schema().field("vector").unwrap().id; + let seg0 = write_vector_segment_metadata( + &dataset, + "vector_idx", + field_id, + Uuid::new_v4(), + [0_u32], + b"seg0", + ) + .await; + let seg1 = write_vector_segment_metadata( + &dataset, + "vector_idx", + field_id, + Uuid::new_v4(), + [1_u32], + b"seg1", + ) + .await; + + // Stage the transaction at `read_version`. 
+ let transaction = dataset + .build_existing_index_segments_transaction( + "vector_idx", + "vector", + vec![segment_from_metadata(&seg0), segment_from_metadata(&seg1)], + ) + .await + .unwrap(); + assert_eq!(transaction.read_version, read_version); + + // Advance the dataset with an unrelated append, moving HEAD past read_version. + let more = lance_datagen::gen_batch() + .col("id", array::step::()) + .col( + "vector", + array::rand_vec::(8.into()), + ) + .into_reader_rows(RowCount::from(10), BatchCount::from(1)); + let dataset = Dataset::write( + more, + test_uri, + Some(WriteParams { + mode: WriteMode::Append, + ..Default::default() + }), + ) + .await + .unwrap(); + assert!(dataset.manifest.version > read_version); + + // The staged transaction still commits cleanly against the advanced HEAD. + let committed = CommitBuilder::new(Arc::new(dataset)) + .execute(transaction) + .await + .unwrap(); + let indices = committed.load_indices_by_name("vector_idx").await.unwrap(); + assert_eq!( + indices.iter().map(|i| i.uuid).collect::>(), + HashSet::from([seg0.uuid, seg1.uuid]), + ); + } + #[tokio::test] async fn test_resolve_index_column_error_cases() { use lance_datagen::{BatchCount, RowCount, array}; From a128e52f8bf520e2a99ea2142a952f9fe91840b0 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Tue, 23 Jun 2026 08:59:00 -0700 Subject: [PATCH 174/177] fix(update): keep nested-field index correct when updating a struct column (#7412) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The update API rejects nested column references (`set` errors on a `.` in the column name), so a nested field can only be changed by setting its whole **top-level struct column** (e.g. `SET s = named_struct('x', …)`). `commit_impl` built `fields_for_preserving_frag_bitmap` from the struct column's own (parent) field id via `field_id(column_name)`, so an index on a nested **child** field (a different leaf id) was absent from the set. 
`register_pure_rewrite_rows_update_frags_in_indices` then wrongly extended that child-field index over the rewritten fragment, which was therefore treated as indexed and never re-scanned — so the updated rows were silently dropped from queries on the index (false negatives). The fix collects the full field subtree of each updated column, so a struct-column update marks all descendant field ids as modified. Flat columns are unaffected (no children). This is the SQL `UPDATE` counterpart of the merge_insert fix in lance-format/lance#7410. Adds a regression test (`test_update_struct_column_keeps_nested_index`): a BTree index on `s.x`, an update of the struct column `s`, and an assertion that the index's effective fragment bitmap is not extended over the rewritten fragment so the updated value is still found. --- .../src/dataset/tests/dataset_merge_update.rs | 119 +++++++++++++++++- rust/lance/src/dataset/write/update.rs | 18 ++- 2 files changed, 133 insertions(+), 4 deletions(-) diff --git a/rust/lance/src/dataset/tests/dataset_merge_update.rs b/rust/lance/src/dataset/tests/dataset_merge_update.rs index c6f448040c2..c96a3db915f 100644 --- a/rust/lance/src/dataset/tests/dataset_merge_update.rs +++ b/rust/lance/src/dataset/tests/dataset_merge_update.rs @@ -8,7 +8,7 @@ use crate::dataset::ROW_ID; use crate::dataset::WriteDestination; use crate::dataset::optimize::{CompactionOptions, compact_files}; use crate::dataset::transaction::{DataReplacementGroup, Operation}; -use crate::dataset::{AutoCleanupParams, MergeInsertBuilder, ProjectionRequest}; +use crate::dataset::{AutoCleanupParams, MergeInsertBuilder, ProjectionRequest, UpdateBuilder}; use crate::index::DatasetIndexExt; use crate::{Dataset, Error}; use lance_core::ROW_ADDR; @@ -28,7 +28,7 @@ use arrow_array::{ ArrayRef, Float32Array, Int32Array, ListArray, RecordBatchIterator, StringArray, types::Int32Type, }; -use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema}; +use arrow_schema::{DataType, Field 
as ArrowField, Fields, Schema as ArrowSchema}; use lance_arrow::BLOB_META_KEY; use lance_core::utils::tempfile::{TempDir, TempStrDir}; use lance_datafusion::utils::reader_to_stream; @@ -1627,6 +1627,121 @@ async fn test_merge_insert_with_reordered_columns_and_index() { final_dataset.validate().await.unwrap(); } +/// With stable row ids, updating a top-level struct column keeps a scalar index on a +/// nested child field correct. The update API rejects nested column references, so a +/// nested field can only be changed by setting its whole struct column; that update must +/// not wrongly extend the child-field index over the rewritten fragment (which would +/// leave the updated value unscanned and silently dropped). +#[tokio::test] +async fn test_update_struct_column_keeps_nested_index() { + let struct_fields = Fields::from(vec![ArrowField::new("x", DataType::Int32, true)]); + let schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("id", DataType::Int32, false), + ArrowField::new("s", DataType::Struct(struct_fields.clone()), true), + ])); + let s_arr = StructArray::new( + struct_fields.clone(), + vec![Arc::new(Int32Array::from(vec![10, 20, 30])) as ArrayRef], + None, + ); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + Arc::new(s_arr) as ArrayRef, + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + let mut dataset = Dataset::write( + reader, + "memory://test_update_nested_index", + Some(WriteParams { + enable_stable_row_ids: true, + ..Default::default() + }), + ) + .await + .unwrap(); + + // BTree index on the NESTED field `s.x`. 
+ dataset + .create_index( + &["s.x"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + false, + ) + .await + .unwrap(); + + let pre = dataset + .scan() + .filter("s.x = 20") + .unwrap() + .try_into_batch() + .await + .unwrap(); + assert_eq!(pre.num_rows(), 1, "precondition: s.x=20 should match id=2"); + + // Nested column references are rejected by `set`, so update the whole struct column + // `s` for id=2, changing s.x 20 -> 999. + let update_result = UpdateBuilder::new(Arc::new(dataset.clone())) + .update_where("id = 2") + .unwrap() + .set("s", "named_struct('x', cast(999 as int))") + .unwrap() + .build() + .unwrap() + .execute() + .await + .unwrap(); + let dataset = update_result.new_dataset; + + // The nested `s.x` index must NOT be extended to the rewritten fragment: its + // effective coverage stays {0}, so the rewritten fragment is left unindexed and + // fully scanned. + let sx_idx = dataset + .load_indices() + .await + .unwrap() + .iter() + .find(|i| i.fields.len() == 1) + .expect("nested s.x index") + .clone(); + let effective = sx_idx + .effective_fragment_bitmap(&dataset.fragment_bitmap) + .expect("index has a fragment bitmap"); + assert_eq!( + effective.iter().collect::>(), + vec![0], + "nested-field index must not be extended to the rewritten fragment" + ); + + // The updated value must be found, and the stale value gone. + let new = dataset + .scan() + .filter("s.x = 999") + .unwrap() + .try_into_batch() + .await + .unwrap(); + assert_eq!( + new.num_rows(), + 1, + "updated value s.x=999 must be found after the struct-column update" + ); + let old = dataset + .scan() + .filter("s.x = 20") + .unwrap() + .try_into_batch() + .await + .unwrap(); + assert_eq!(old.num_rows(), 0, "s.x=20 should no longer match any row"); +} + /// DataReplacement should invalidate index fragment bitmaps for replaced fields. 
#[tokio::test] async fn test_data_replacement_invalidates_index_bitmap() { diff --git a/rust/lance/src/dataset/write/update.rs b/rust/lance/src/dataset/write/update.rs index 6672c58db1f..f8e71b834a7 100644 --- a/rust/lance/src/dataset/write/update.rs +++ b/rust/lance/src/dataset/write/update.rs @@ -34,6 +34,16 @@ use lance_table::format::{Fragment, RowIdMeta}; use roaring::RoaringTreemap; use snafu::ResultExt; +/// Collect a field id and all of its descendant field ids (pre-order). A struct +/// column update rewrites the whole subtree, so an index on any descendant must be +/// treated as modified. +fn collect_subtree_field_ids(field: &lance_core::datatypes::Field, out: &mut Vec) { + out.push(field.id as u32); + for child in &field.children { + collect_subtree_field_ids(child, out); + } +} + /// Build an update operation. /// /// This operation is similar to SQL's UPDATE statement. It allows you to change @@ -386,10 +396,14 @@ impl UpdateJob { dataset: Arc, update_data: UpdateData, ) -> Result { + // Updated columns are top-level (nested references are rejected by `set`), but a + // struct-column update rewrites all of its descendants. Collect the full field + // subtree so an index on a nested child field is recognized as modified and not + // wrongly extended over the rewritten fragment. 
let mut fields_for_preserving_frag_bitmap = Vec::new(); for column_name in self.updates.keys() { - if let Ok(field_id) = dataset.schema().field_id(column_name) { - fields_for_preserving_frag_bitmap.push(field_id as u32); + if let Some(field) = dataset.schema().field(column_name) { + collect_subtree_field_ids(field, &mut fields_for_preserving_frag_bitmap); } } From 5f7f6c9515b7303f2127884a106264c9fc076522 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 23 Jun 2026 09:15:21 -0700 Subject: [PATCH 175/177] perf(index): reduce TwoFileShuffler peak memory via interleave sort (#7295) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - Replaces `rechunk_stream_by_size` + `concat_batches` + `take` (two full-data copies, peak ~3–4× `batch_size_bytes`) with a single-pass sort over the UInt32 part-id columns only, producing `(batch_idx, row_idx)` interleave indices. - Sorted output is streamed to the data file via `interleave_batches` in 8 Ki-row chunks, so the interleave output adds only a small constant overhead above the accumulated source data. - Peak memory drops to **~1× `batch_size_bytes`**, which enables setting `LANCE_SHUFFLE_BATCH_BYTES` much larger to reduce flush-group count and improve read-time I/O locality. Closes #7299. 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Sonnet 4.6 --- rust/lance-index/src/vector/v3/shuffler.rs | 311 +++++++++++++++------ 1 file changed, 221 insertions(+), 90 deletions(-) diff --git a/rust/lance-index/src/vector/v3/shuffler.rs b/rust/lance-index/src/vector/v3/shuffler.rs index 0b76517e1c2..4203d099d0b 100644 --- a/rust/lance-index/src/vector/v3/shuffler.rs +++ b/rust/lance-index/src/vector/v3/shuffler.rs @@ -5,8 +5,7 @@ //! the corresponding IVF partitions. 
use std::ops::Range; -use std::sync::atomic::AtomicU64; -use std::sync::{Arc, Mutex}; +use std::sync::Arc; use arrow::compute::concat_batches; use arrow::datatypes::UInt64Type; @@ -14,8 +13,7 @@ use arrow::{array::AsArray, compute::sort_to_indices}; use arrow_array::{RecordBatch, UInt32Array, UInt64Array}; use arrow_schema::{DataType, Field, Schema}; use futures::{future::try_join_all, prelude::*}; -use lance_arrow::stream::rechunk_stream_by_size; -use lance_arrow::{RecordBatchExt, SchemaExt}; +use lance_arrow::{RecordBatchExt, SchemaExt, interleave_batches}; use lance_core::{ Error, Result, cache::LanceCache, @@ -341,6 +339,11 @@ pub fn create_ivf_shuffler( const DEFAULT_SHUFFLE_BATCH_BYTES: usize = 128 * 1024 * 1024; +/// Number of rows per output batch when streaming sorted data via interleave. +/// Small enough to keep the output chunk's memory footprint modest relative to +/// the accumulated source data. +const SHUFFLE_WRITE_CHUNK_ROWS: usize = 8 * 1024; + /// Limit of how much transformed data we accumulate before spilling to disk. /// /// A larger value will use more RAM but require less random access during the @@ -407,6 +410,51 @@ impl TwoFileShuffler { } } +/// `(batch_idx, row_idx)` pairs produced by [`sort_to_interleave_indices`], paired with +/// per-partition row counts. +type InterleaveResult = (Vec<(usize, usize)>, Vec); + +/// Sorts rows from multiple batches by partition ID and returns interleave indices. +/// +/// Builds a sort key of `(part_id, batch_idx, row_idx)` for every row across all +/// batches, sorts by `part_id`, then emits `(batch_idx, row_idx)` pairs in that +/// order. This avoids concatenating the full data: only the `UInt32` partition-ID +/// columns are touched here. +/// +/// Also returns per-partition row counts (derived from the same sorted keys at no +/// extra cost). +/// +/// Returns an error if any partition ID is out of range `[0, num_partitions)`. 
+fn sort_to_interleave_indices( + part_id_columns: &[&UInt32Array], + num_partitions: usize, +) -> Result { + let total_rows: usize = part_id_columns.iter().map(|a| a.len()).sum(); + let mut keys: Vec<(u32, u32, u32)> = Vec::with_capacity(total_rows); + for (batch_idx, col) in part_id_columns.iter().enumerate() { + let batch_idx = batch_idx as u32; + for (row_idx, &part_id) in col.values().iter().enumerate() { + keys.push((part_id, batch_idx, row_idx as u32)); + } + } + keys.sort_unstable_by_key(|k| k.0); + + let mut partition_counts = vec![0u64; num_partitions]; + let mut interleave_indices = Vec::with_capacity(total_rows); + for (part_id, batch_idx, row_idx) in &keys { + let pid = *part_id as usize; + if pid >= num_partitions { + return Err(Error::invalid_input(format!( + "partition ID {} is out of range [0, {})", + pid, num_partitions + ))); + } + partition_counts[pid] += 1; + interleave_indices.push((*batch_idx as usize, *row_idx as usize)); + } + Ok((interleave_indices, partition_counts)) +} + #[async_trait::async_trait] impl Shuffler for TwoFileShuffler { async fn shuffle( @@ -414,8 +462,7 @@ impl Shuffler for TwoFileShuffler { data: Box, ) -> Result> { let num_partitions = self.num_partitions; - let full_schema = Arc::new(data.schema().as_ref().clone()); - // No need to write partition ids since we can infer this + // No need to write partition ids since we can infer this from offsets let schema = data.schema().without_column(PART_ID_COLUMN); let offsets_schema = Arc::new(Schema::new(vec![Field::new( "offset", @@ -424,28 +471,6 @@ impl Shuffler for TwoFileShuffler { )])); let batch_size_bytes = self.batch_size_bytes; - // Extract loss from batch metadata before rechunking (concat_batches drops metadata) - let total_loss = Arc::new(Mutex::new(0.0f64)); - let loss_ref = total_loss.clone(); - let loss_stream = data.map(move |result| { - result.inspect(|batch| { - let loss = batch - .metadata() - .get(LOSS_METADATA_KEY) - .and_then(|s| s.parse::().ok()) - 
.unwrap_or(0.0); - *loss_ref.lock().unwrap() += loss; - }) - }); - - // Rechunk to target batch size - let rechunked = rechunk_stream_by_size( - loss_stream, - full_schema, - batch_size_bytes, - batch_size_bytes * 2, - ); - // Create data file writer let data_path = self.output_dir.clone().join("shuffle_data.lance"); let spill_path = self.output_dir.clone().join("shuffle_data.spill"); @@ -468,72 +493,63 @@ impl Shuffler for TwoFileShuffler { )? .with_page_metadata_spill(self.object_store.clone(), spill_path); - let num_batches = Arc::new(AtomicU64::new(0)); - let num_batches_ref = num_batches.clone(); + let mut num_batches: u64 = 0; let mut partition_counts: Vec = vec![0; num_partitions]; let mut global_row_count: u64 = 0; let mut rows_processed: u64 = 0; + let mut total_loss = 0.0f64; + let mut accumulated: Vec = Vec::new(); + let mut acc_bytes: usize = 0; - let mut rechunked = std::pin::pin!(rechunked); - while let Some(batch) = rechunked.next().await { - num_batches_ref.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + let mut data = std::pin::pin!(data); + while let Some(batch) = data.next().await { let batch = batch?; - let np = num_partitions; - let num_rows = batch.num_rows() as u64; - - // Sort by partition ID and compute offsets on CPU - let (sorted_batch, batch_offsets) = spawn_cpu(move || { - let part_ids: &UInt32Array = batch[PART_ID_COLUMN].as_primitive(); - let indices = sort_to_indices(part_ids, None, None)?; - let batch = batch.take(&indices)?; - - let part_ids: &UInt32Array = batch[PART_ID_COLUMN].as_primitive(); - let batch = batch.drop_column(PART_ID_COLUMN)?; - - // Count rows per partition by scanning sorted part IDs - let mut partition_counts = vec![0u64; np]; - for i in 0..part_ids.len() { - let pid = part_ids.value(i) as usize; - if pid < np { - partition_counts[pid] += 1; - } else { - log::warn!("Partition ID {} is out of range [0, {})", pid, np); - } - } - - // Build cumulative offsets (end positions) for this batch - let mut 
batch_offsets = Vec::with_capacity(np); - let mut running = 0u64; - for count in &partition_counts { - running += count; - batch_offsets.push(running); + total_loss += batch + .metadata() + .get(LOSS_METADATA_KEY) + .and_then(|s| s.parse::().ok()) + .unwrap_or(0.0); + acc_bytes += batch.get_array_memory_size(); + accumulated.push(batch); + + if acc_bytes >= batch_size_bytes { + let (total_rows, counts) = flush_shuffle_batch( + std::mem::take(&mut accumulated), + &mut file_writer, + &mut offsets_writer, + offsets_schema.clone(), + num_partitions, + global_row_count, + ) + .await?; + acc_bytes = 0; + for (p, c) in counts.iter().enumerate() { + partition_counts[p] += c; } + global_row_count += total_rows; + rows_processed += total_rows; + num_batches += 1; + self.progress + .stage_progress("shuffle", rows_processed) + .await?; + } + } - Ok::<(RecordBatch, Vec), Error>((batch, batch_offsets)) - }) + if !accumulated.is_empty() { + let (total_rows, counts) = flush_shuffle_batch( + accumulated, + &mut file_writer, + &mut offsets_writer, + offsets_schema, + num_partitions, + global_row_count, + ) .await?; - - // Write sorted batch to data file - file_writer.write_batch(&sorted_batch).await?; - - // Record offsets adjusted by global row count - let mut adjusted_offsets = Vec::with_capacity(batch_offsets.len()); - let mut last_offset = 0; - for (idx, offset) in batch_offsets.iter().enumerate() { - adjusted_offsets.push(global_row_count + offset); - partition_counts[idx] += offset - last_offset; - last_offset = *offset; + for (p, c) in counts.iter().enumerate() { + partition_counts[p] += c; } - global_row_count += sorted_batch.num_rows() as u64; - - // Write offsets to offsets file - let offsets_batch = RecordBatch::try_new( - offsets_schema.clone(), - vec![Arc::new(UInt64Array::from(adjusted_offsets))], - )?; - offsets_writer.write_batch(&offsets_batch).await?; - - rows_processed += num_rows; + rows_processed += total_rows; + num_batches += 1; self.progress 
.stage_progress("shuffle", rows_processed) .await?; @@ -543,22 +559,76 @@ impl Shuffler for TwoFileShuffler { file_writer.finish().await?; offsets_writer.finish().await?; - let num_batches = num_batches.load(std::sync::atomic::Ordering::Relaxed); - - let total_loss_val = *total_loss.lock().unwrap(); - TwoFileShuffleReader::try_new( self.object_store.clone(), self.output_dir.clone(), num_partitions, num_batches, partition_counts, - total_loss_val, + total_loss, ) .await } } +/// Sorts `accumulated` batches by partition ID and writes the result to the data +/// and offsets files. +/// +/// Returns `(total_rows_written, per_partition_row_counts)`. +async fn flush_shuffle_batch( + accumulated: Vec, + file_writer: &mut FileWriter, + offsets_writer: &mut FileWriter, + offsets_schema: Arc, + num_partitions: usize, + global_row_count: u64, +) -> Result<(u64, Vec)> { + let total_rows: u64 = accumulated.iter().map(|b| b.num_rows() as u64).sum(); + + // Clone part-id columns into the CPU task (cheap: Arc ref bump, not data copy). + let part_id_cols: Vec = accumulated + .iter() + .map(|b| { + let col: &UInt32Array = b[PART_ID_COLUMN].as_primitive(); + col.clone() + }) + .collect(); + + let np = num_partitions; + let (interleave_indices, batch_partition_counts) = + spawn_cpu(move || sort_to_interleave_indices(&part_id_cols.iter().collect::>(), np)) + .await?; + + // Drop part-id column from source batches before interleaving. + let source_batches: Vec = accumulated + .into_iter() + .map(|b| b.drop_column(PART_ID_COLUMN).map_err(Error::from)) + .collect::>()?; + + // Stream sorted output to the data file in fixed-size chunks so the peak + // memory for the interleave output stays small relative to the source data. 
+ for chunk in interleave_indices.chunks(SHUFFLE_WRITE_CHUNK_ROWS) { + let out = interleave_batches(&source_batches, chunk)?; + file_writer.write_batch(&out).await?; + } + + // Compute cumulative end-row offsets (adjusted by global position) and write + // one offsets batch for this flush group. + let mut adjusted_offsets = Vec::with_capacity(num_partitions); + let mut running = 0u64; + for count in &batch_partition_counts { + running += count; + adjusted_offsets.push(global_row_count + running); + } + let offsets_batch = RecordBatch::try_new( + offsets_schema, + vec![Arc::new(UInt64Array::from(adjusted_offsets))], + )?; + offsets_writer.write_batch(&offsets_batch).await?; + + Ok((total_rows, batch_partition_counts)) +} + pub struct TwoFileShuffleReader { _scheduler: Arc, file_reader: FileReader, @@ -934,4 +1004,65 @@ mod tests { assert!((reader.total_loss().unwrap() - 6.0).abs() < 1e-10); } + + #[tokio::test] + async fn test_two_file_shuffler_multi_batch_single_flush() { + // All three batches fit within the default batch_size_bytes, so they + // accumulate and are interleaved in a single flush group. This exercises + // the cross-batch interleave path. + let dir = TempStrDir::default(); + let output_dir = Path::from(dir.as_ref()); + let num_partitions = 3; + + let batch1 = make_batch(&[0, 1, 2], &[10, 20, 30], None); + let batch2 = make_batch(&[2, 0, 1], &[40, 50, 60], None); + let batch3 = make_batch(&[1, 2, 0], &[70, 80, 90], None); + + // Large batch_size_bytes so all three batches flush together. 
+ let shuffler = + TwoFileShuffler::new(output_dir, num_partitions).with_batch_size_bytes(1024 * 1024); + let stream = batches_to_stream(vec![batch1, batch2, batch3]); + let reader = shuffler.shuffle(stream).await.unwrap(); + + assert_eq!(reader.partition_size(0).unwrap(), 3); + assert_eq!(reader.partition_size(1).unwrap(), 3); + assert_eq!(reader.partition_size(2).unwrap(), 3); + + let p0 = collect_partition(reader.as_ref(), 0).await.unwrap(); + let vals: &Int32Array = p0.column_by_name("val").unwrap().as_primitive(); + let mut v: Vec = vals.iter().map(|x| x.unwrap()).collect(); + v.sort(); + assert_eq!(v, vec![10, 50, 90]); + + let p1 = collect_partition(reader.as_ref(), 1).await.unwrap(); + let vals: &Int32Array = p1.column_by_name("val").unwrap().as_primitive(); + let mut v: Vec = vals.iter().map(|x| x.unwrap()).collect(); + v.sort(); + assert_eq!(v, vec![20, 60, 70]); + + let p2 = collect_partition(reader.as_ref(), 2).await.unwrap(); + let vals: &Int32Array = p2.column_by_name("val").unwrap().as_primitive(); + let mut v: Vec = vals.iter().map(|x| x.unwrap()).collect(); + v.sort(); + assert_eq!(v, vec![30, 40, 80]); + } + + #[tokio::test] + async fn test_two_file_shuffler_out_of_range_partition_id() { + let dir = TempStrDir::default(); + let output_dir = Path::from(dir.as_ref()); + + // Row with partition ID 5 is out of range for num_partitions=3. 
+ let batch = make_batch(&[0, 5, 1], &[10, 20, 30], None); + + let shuffler = TwoFileShuffler::new(output_dir, 3); + let stream = batches_to_stream(vec![batch]); + let Err(err) = shuffler.shuffle(stream).await else { + panic!("expected an error for out-of-range partition ID"); + }; + assert!( + err.to_string().contains("partition ID 5 is out of range"), + "unexpected error: {err}" + ); + } } From 666e5b2ae519167c1c29747837b153e49899fca3 Mon Sep 17 00:00:00 2001 From: Wyatt Alt Date: Tue, 23 Jun 2026 09:46:19 -0700 Subject: [PATCH 176/177] fix(index): drop stale scalar index entries after stable-row-id update (#7359) Under stable row ids an update deletes a row's old copy and rewrites it to a new fragment under the same row id. optimize_indices kept the old value->row_id entry, so queries for the old value returned the updated row and BTree optimize errored ("from_sorted_iter called with non-sorted input"). - build_stable_row_id_filter now subtracts each fragment's deletion vector so the old-row allow-list holds only live rows (fixes BTree). - BitmapIndex::update applies that filter to old postings via OldIndexDataFilter::retain_old_rows. - optimize routes FTS through InvertedIndex::merge_segments (which filters old partitions) instead of the reference-only update path. Adds a regression test covering all three index types. --------- Co-authored-by: Claude Opus 4.8 (1M context) --- rust/lance-index/src/scalar.rs | 11 + rust/lance-index/src/scalar/bitmap.rs | 56 ++- rust/lance/src/index/append.rs | 533 ++++++++++++++++++++++++-- 3 files changed, 569 insertions(+), 31 deletions(-) diff --git a/rust/lance-index/src/scalar.rs b/rust/lance-index/src/scalar.rs index a287d277a81..3a6834129b3 100644 --- a/rust/lance-index/src/scalar.rs +++ b/rust/lance-index/src/scalar.rs @@ -960,6 +960,17 @@ impl OldIndexDataFilter { .collect(), } } + + /// Apply this filter in place to a set of existing (old) row ids/addresses, + /// retaining only the rows the filter selects to keep. 
Used by index types + /// that merge old postings directly (e.g. bitmap) instead of re-scanning a + /// row-id array through [`Self::filter_row_ids`]. + pub fn retain_old_rows(&self, rows: &mut RowAddrTreeMap) { + match self { + Self::Fragments { to_keep, .. } => rows.retain_fragments(to_keep.iter()), + Self::RowIds(valid_row_ids) => *rows &= valid_row_ids, + } + } } impl UpdateCriteria { diff --git a/rust/lance-index/src/scalar/bitmap.rs b/rust/lance-index/src/scalar/bitmap.rs index 8729aadbca2..23b300a2d73 100644 --- a/rust/lance-index/src/scalar/bitmap.rs +++ b/rust/lance-index/src/scalar/bitmap.rs @@ -806,13 +806,14 @@ impl ScalarIndex for BitmapIndex { &self, new_data: SendableRecordBatchStream, dest_store: &dyn IndexStore, - _old_data_filter: Option, + old_data_filter: Option, ) -> Result { let file = BitmapIndexPlugin::streaming_build_and_write( new_data, Some(self), dest_store, BITMAP_LOOKUP_NAME, + old_data_filter.as_ref(), ) .await?; @@ -1191,6 +1192,19 @@ async fn cleanup_bitmap_shard_files(store: &dyn IndexStore, shard_files: &[Strin #[derive(Debug, Default)] pub struct BitmapIndexPlugin; +/// Drop the rows an old posting should no longer expose -- rows whose fragment +/// was removed, or (under stable row ids) rows rewritten by an update -- keeping +/// only those `filter` still considers valid. A no-op when `filter` is `None`. 
+fn retain_valid( + mut bitmap: RowAddrTreeMap, + filter: Option<&super::OldIndexDataFilter>, +) -> RowAddrTreeMap { + if let Some(filter) = filter { + filter.retain_old_rows(&mut bitmap); + } + bitmap +} + impl BitmapIndexPlugin { fn get_batch_from_arrays( keys: Arc, @@ -1322,7 +1336,7 @@ impl BitmapIndexPlugin { data: SendableRecordBatchStream, index_store: &dyn IndexStore, ) -> Result { - Self::streaming_build_and_write(data, None, index_store, BITMAP_LOOKUP_NAME).await + Self::streaming_build_and_write(data, None, index_store, BITMAP_LOOKUP_NAME, None).await } async fn train_bitmap_shard( @@ -1337,7 +1351,8 @@ impl BitmapIndexPlugin { progress .stage_start("build_bitmap_shard", None, "rows") .await?; - let file = Self::streaming_build_and_write(data, None, index_store, &file_name).await?; + let file = + Self::streaming_build_and_write(data, None, index_store, &file_name, None).await?; progress.stage_complete("build_bitmap_shard").await?; Ok(file) } @@ -1354,6 +1369,7 @@ impl BitmapIndexPlugin { old_index: Option<&BitmapIndex>, index_store: &dyn IndexStore, output_file_name: &str, + old_data_filter: Option<&super::OldIndexDataFilter>, ) -> Result { let value_type = data_source.schema().field(0).data_type().clone(); @@ -1400,6 +1416,7 @@ impl BitmapIndexPlugin { &mut old_pos, &mut emitted_null, &mut writer, + old_data_filter, ) .await?; } @@ -1422,6 +1439,7 @@ impl BitmapIndexPlugin { &mut old_pos, &mut emitted_null, &mut writer, + old_data_filter, ) .await?; } @@ -1429,7 +1447,13 @@ impl BitmapIndexPlugin { // Emit any remaining old-only entries. if let Some(idx) = old_index { while old_pos < old_keys.len() { - let old_bitmap = idx.load_bitmap(&old_keys[old_pos], None).await?; + let old_bitmap = retain_valid( + idx.load_bitmap(&old_keys[old_pos], None) + .await? 
+ .as_ref() + .clone(), + old_data_filter, + ); writer .emit(old_keys[old_pos].0.clone(), &old_bitmap) .await?; @@ -1444,7 +1468,8 @@ impl BitmapIndexPlugin { { let null_key = new_null_array(&value_type, 1); let null_key = ScalarValue::try_from_array(null_key.as_ref(), 0)?; - writer.emit(null_key, &idx.null_map).await?; + let null_bitmap = retain_valid((*idx.null_map).clone(), old_data_filter); + writer.emit(null_key, &null_bitmap).await?; } writer.finish().await @@ -1453,6 +1478,7 @@ impl BitmapIndexPlugin { /// Flush a completed value-run from the new data stream, emitting any /// old-only entries that sort before it and merging the old bitmap if the /// key exists in both old and new. + #[allow(clippy::too_many_arguments)] async fn finish_run( key: ScalarValue, bitmap: &mut RowAddrTreeMap, @@ -1461,13 +1487,14 @@ impl BitmapIndexPlugin { old_pos: &mut usize, emitted_null: &mut bool, writer: &mut BitmapBatchWriter, + old_data_filter: Option<&super::OldIndexDataFilter>, ) -> Result<()> { if key.is_null() { // Null values are stored separately in the old index's null_map. if let Some(idx) = old_index && !idx.null_map.is_empty() { - *bitmap |= &*idx.null_map; + *bitmap |= &retain_valid((*idx.null_map).clone(), old_data_filter); } *emitted_null = true; writer.emit(key, bitmap).await?; @@ -1476,7 +1503,13 @@ impl BitmapIndexPlugin { // Emit old-only entries that sort before this key. while *old_pos < old_keys.len() && old_keys[*old_pos] < orderable { - let old_bitmap = idx.load_bitmap(&old_keys[*old_pos], None).await?; + let old_bitmap = retain_valid( + idx.load_bitmap(&old_keys[*old_pos], None) + .await? + .as_ref() + .clone(), + old_data_filter, + ); writer .emit(old_keys[*old_pos].0.clone(), &old_bitmap) .await?; @@ -1485,8 +1518,13 @@ impl BitmapIndexPlugin { // If the old index also has this key, merge its bitmap. 
if *old_pos < old_keys.len() && old_keys[*old_pos] == orderable { - let old_bitmap = idx.load_bitmap(&old_keys[*old_pos], None).await?; - *bitmap |= &*old_bitmap; + *bitmap |= &retain_valid( + idx.load_bitmap(&old_keys[*old_pos], None) + .await? + .as_ref() + .clone(), + old_data_filter, + ); *old_pos += 1; } diff --git a/rust/lance/src/index/append.rs b/rust/lance/src/index/append.rs index 99ff7bebe43..d3ecde030c1 100644 --- a/rust/lance/src/index/append.rs +++ b/rust/lance/src/index/append.rs @@ -65,16 +65,49 @@ async fn build_stable_row_id_filter( .try_collect::>() .await?; - let row_id_maps = row_id_sequences - .iter() - .map(|(_, seq)| RowAddrTreeMap::from(seq.as_ref())) - .collect::>(); + let frag_by_id: std::collections::HashMap = dataset + .get_fragments() + .into_iter() + .map(|f| (f.id() as u32, f)) + .collect(); + + let mut row_id_maps = Vec::with_capacity(row_id_sequences.len()); + for (frag_id, seq) in &row_id_sequences { + row_id_maps.push(live_row_ids(frag_by_id.get(frag_id), seq).await?); + } let row_id_map_refs = row_id_maps.iter().collect::>(); // Merge all fragment-local row-id sets into one exact membership structure. Ok(::union_all(&row_id_map_refs)) } +/// The fragment's live row ids: its persisted row-id sequence minus the rows +/// its deletion vector marks gone. A persisted sequence covers every row the +/// fragment ever held, so a row whose old copy was deleted (e.g. rewritten by an +/// update under the same stable row id) would otherwise be retained as a stale +/// old-index entry. +async fn live_row_ids( + fragment: Option<&crate::dataset::fragment::FileFragment>, + seq: &lance_table::rowids::RowIdSequence, +) -> Result { + // Propagate a deletion-vector read failure rather than swallowing it: a + // swallowed error would fall through to the "no deletions" branch below, + // putting the deleted rows back into the allow-list as stale entries. 
+ let deletion_vector = match fragment { + Some(f) if f.metadata().deletion_file.is_some() => f.get_deletion_vector().await?, + _ => None, + }; + Ok(match deletion_vector { + Some(dv) => seq + .iter() + .enumerate() + .filter(|(offset, _)| !dv.contains(*offset as u32)) + .map(|(_, row_id)| row_id) + .collect(), + None => RowAddrTreeMap::from(seq), + }) +} + /// Build the [`OldIndexDataFilter`] that must be applied to existing index /// rows when their owning fragments have been pruned by compaction or /// deletions. @@ -198,6 +231,58 @@ async fn rebuild_scalar_segment( .await } +/// The index segments to rewrite in this optimize pass. +/// +/// Normally the trailing `num_indices_to_merge` segments. Under stable row ids, +/// any *older* segment that still covers a fragment carrying deletions is added +/// too: an update deletes a row's old copy (leaving a deletion vector) and +/// rewrites it under the same row id, so its stale old-value postings survive +/// until that segment is rewritten and filtered. Only the segments that actually +/// cover a deleted-from fragment are pulled in -- clean segments in between are +/// left untouched -- so an edit to old data does not force a full reindex. +/// +/// The deletion check is conservative (any current deletion vector on a covered +/// fragment), so a segment built after those deletions may be rewritten as a +/// harmless no-op; it never leaves a stale segment behind (PR #7359). +fn select_segments_to_merge<'a>( + dataset: &Dataset, + old_indices: &[&'a IndexMetadata], + options: &OptimizeOptions, +) -> Vec<&'a IndexMetadata> { + let num_to_merge = options + .num_indices_to_merge + .unwrap_or(1) + .min(old_indices.len()); + let tail_start = old_indices.len() - num_to_merge; + + // Address-style row ids mask stale postings at search time, and append mode + // (num_to_merge == 0) defers cleanup to a real merge; both keep the plain tail. 
+ if num_to_merge == 0 || !dataset.manifest.uses_stable_row_ids() { + return old_indices[tail_start..].to_vec(); + } + + let deleted_frags: RoaringBitmap = dataset + .get_fragments() + .iter() + .filter(|f| f.metadata().deletion_file.is_some()) + .map(|f| f.id() as u32) + .collect(); + if deleted_frags.is_empty() { + return old_indices[tail_start..].to_vec(); + } + + let mut selected = Vec::new(); + for (i, idx) in old_indices.iter().enumerate() { + let covers_deleted = idx + .effective_fragment_bitmap(&dataset.fragment_bitmap) + .is_some_and(|eff| !eff.is_disjoint(&deleted_frags)); + if i >= tail_start || covers_deleted { + selected.push(*idx); + } + } + selected +} + #[allow(clippy::too_many_arguments)] async fn merge_scalar_indices<'a>( dataset: Arc, @@ -215,18 +300,13 @@ async fn merge_scalar_indices<'a>( )); } - let num_to_merge = options - .num_indices_to_merge - .unwrap_or(1) - .min(old_indices.len()); + let selected_old_indices = select_segments_to_merge(dataset.as_ref(), old_indices, options); // No new data + ≤1 old selected = rewriting one segment to itself. - if unindexed.is_empty() && num_to_merge <= 1 { + if unindexed.is_empty() && selected_old_indices.len() <= 1 { return Ok(None); } - let selected_old_indices = &old_indices[old_indices.len() - num_to_merge..]; - // For the delta case (`selected` empty) the reference is purely // for reading params; fall back to the last old index then. let reference_idx = selected_old_indices @@ -284,17 +364,20 @@ async fn merge_scalar_indices<'a>( match index_type { IndexType::BTree => { let (_, old_data_filters) = - build_per_segment_filters(dataset.as_ref(), selected_old_indices).await?; + build_per_segment_filters(dataset.as_ref(), &selected_old_indices).await?; crate::index::scalar::btree::open_and_merge_segments( dataset.as_ref(), field_path, - selected_old_indices, + &selected_old_indices, new_data_stream, &new_store, &old_data_filters, ) .await? 
} + // NOTE: IndexType::Inverted never reaches here -- it is handled by the + // dedicated arm in merge_indices_with_unindexed_frags before this + // function is called. _ => { let old_data_filter = build_old_data_filter( dataset.as_ref(), @@ -609,16 +692,11 @@ pub async fn merge_indices_with_unindexed_frags<'a>( let index_type = indices[0].index_type(); match index_type { IndexType::Inverted => { - let num_to_merge = options - .num_indices_to_merge - .unwrap_or(1) - .min(old_indices.len()); - if unindexed.is_empty() && num_to_merge <= 1 { + let selected_old_indices = + select_segments_to_merge(dataset.as_ref(), old_indices, options); + if unindexed.is_empty() && selected_old_indices.len() <= 1 { return Ok(None); } - - let selected_start = old_indices.len().saturating_sub(num_to_merge); - let selected_old_indices = &old_indices[selected_start..]; let reference_idx = selected_old_indices .first() .copied() @@ -674,7 +752,7 @@ pub async fn merge_indices_with_unindexed_frags<'a>( let mut frag_bitmap = base_unindexed_bitmap; let mut effective_old_frags = RoaringBitmap::new(); let mut selected_indices = Vec::with_capacity(selected_old_indices.len()); - for idx in selected_old_indices { + for idx in &selected_old_indices { if let Some(effective) = idx.effective_fragment_bitmap(&dataset.fragment_bitmap) { frag_bitmap |= &effective; @@ -1922,6 +2000,417 @@ mod tests { assert_eq!(query_id_count(&dataset, "song-42").await, 1); } + /// Under stable row ids, updating an indexed column and then calling + /// `optimize_indices` must not leave stale entries (old value -> updated row) + /// in the scalar index. An update deletes the old copy of each row and + /// rewrites it under the same stable row id, so the old index entry is stale + /// and must be dropped on merge. Covers BTree, Bitmap, and Inverted (FTS), + /// which take three different merge paths. 
+ #[tokio::test] + async fn test_optimize_scalar_index_drops_stale_rows_after_update() { + use crate::dataset::UpdateBuilder; + use arrow_array::Int32Array; + use lance_index::scalar::FullTextSearchQuery; + use lance_index::scalar::inverted::InvertedIndexParams; + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + + // 100 rows: num == id; cat = "A" for id<50 else "B"; body = "alpha" for + // id<50 else "beta". + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("num", DataType::Int32, false), + Field::new("cat", DataType::Utf8, false), + Field::new("body", DataType::Utf8, false), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from_iter_values(0..100)), + Arc::new(Int32Array::from_iter_values(0..100)), + Arc::new(StringArray::from_iter_values( + (0..100).map(|i| if i < 50 { "A" } else { "B" }), + )), + Arc::new(StringArray::from_iter_values( + (0..100).map(|i| if i < 50 { "alpha" } else { "beta" }), + )), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + enable_stable_row_ids: true, + ..Default::default() + }), + ) + .await + .unwrap(); + + dataset + .create_index( + &["num"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + dataset + .create_index( + &["cat"], + IndexType::Bitmap, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + dataset + .create_index( + &["body"], + IndexType::Inverted, + None, + &InvertedIndexParams::default(), + true, + ) + .await + .unwrap(); + + // Update the first 25 rows (id < 25): num -> -1, cat -> 'B', body -> 'beta'. 
+ let res = UpdateBuilder::new(Arc::new(dataset.clone())) + .update_where("id < 25") + .unwrap() + .set("num", "-1") + .unwrap() + .set("cat", "'B'") + .unwrap() + .set("body", "'beta'") + .unwrap() + .build() + .unwrap() + .execute() + .await + .unwrap(); + dataset = res.new_dataset.as_ref().clone(); + + dataset + .optimize_indices(&OptimizeOptions::default()) + .await + .unwrap(); + let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + + // BTree: `num >= 0` matches ids 25..99 (75 rows); the 25 updated rows + // hold num = -1 and must not appear. + let btree_count = dataset + .scan() + .filter("num >= 0") + .unwrap() + .count_rows() + .await + .unwrap(); + assert_eq!(btree_count, 75, "btree returned stale/incorrect rows"); + + // Bitmap: only the 25 rows (ids 25..49) that still carry cat = 'A' match; + // the 25 rows updated to 'B' must not. + let bitmap_count = dataset + .scan() + .filter("cat = 'A'") + .unwrap() + .count_rows() + .await + .unwrap(); + assert_eq!(bitmap_count, 25, "bitmap returned stale rows"); + + // FTS: only the 25 rows (ids 25..49) whose body still reads "alpha" match; + // the 25 rows updated to "beta" must not. + let mut scan = dataset.scan(); + scan.full_text_search(FullTextSearchQuery::new("alpha".to_owned())) + .unwrap(); + let fts_count = scan.count_rows().await.unwrap(); + assert_eq!(fts_count, 25, "FTS index returned stale rows"); + } + + /// Multi-segment variant (Jack Ye's repro, PR #7359): with one BTree segment + /// per fragment, default optimize merges only the tail segment. A stable-row-id + /// update to a row in an older segment's fragment must still drop that + /// segment's stale postings -- the merge has to reach back to cover it. 
+ #[tokio::test] + async fn test_optimize_btree_drops_stale_rows_across_segments_after_update() { + use crate::dataset::UpdateBuilder; + use crate::index::CreateIndexBuilder; + use arrow_array::Int32Array; + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("num", DataType::Int32, false), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from_iter_values(0..100)), + Arc::new(Int32Array::from_iter_values(0..100)), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + // Two fragments (0..49, 50..99) -> one BTree segment each. + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + enable_stable_row_ids: true, + max_rows_per_file: 50, + ..Default::default() + }), + ) + .await + .unwrap(); + + let params = ScalarIndexParams::for_builtin(BuiltinIndexType::BTree); + let fragments = dataset.get_fragments(); + let mut segments = Vec::new(); + for fragment in &fragments { + segments.push( + CreateIndexBuilder::new(&mut dataset, &["num"], IndexType::BTree, &params) + .name("num_idx".to_string()) + .fragments(vec![fragment.id() as u32]) + .execute_uncommitted() + .await + .unwrap(), + ); + } + dataset + .commit_existing_index_segments("num_idx", "num", segments) + .await + .unwrap(); + + // Update the first 25 rows (in the first/older segment's fragment). 
+ let res = UpdateBuilder::new(Arc::new(dataset.clone())) + .update_where("id < 25") + .unwrap() + .set("num", "-1") + .unwrap() + .build() + .unwrap() + .execute() + .await + .unwrap(); + dataset = res.new_dataset.as_ref().clone(); + + dataset + .optimize_indices(&OptimizeOptions::default()) + .await + .unwrap(); + let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + + assert_eq!( + dataset + .scan() + .filter("num = 0") + .unwrap() + .count_rows() + .await + .unwrap(), + 0, + "stale entry leaked from the older, unmerged segment" + ); + assert_eq!( + dataset + .scan() + .filter("num >= 0") + .unwrap() + .count_rows() + .await + .unwrap(), + 75 + ); + } + + /// Same multi-segment gap for FTS, which takes the separate Inverted dispatch + /// path. One Inverted segment per fragment; an update to the older segment's + /// fragment must not leave its old-token postings searchable. + #[tokio::test] + async fn test_optimize_fts_drops_stale_rows_across_segments_after_update() { + use crate::dataset::UpdateBuilder; + use crate::index::CreateIndexBuilder; + use arrow_array::Int32Array; + use lance_index::scalar::FullTextSearchQuery; + use lance_index::scalar::inverted::InvertedIndexParams; + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("body", DataType::Utf8, false), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from_iter_values(0..100)), + Arc::new(StringArray::from_iter_values( + (0..100).map(|i| if i < 50 { "alpha" } else { "beta" }), + )), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + enable_stable_row_ids: true, + max_rows_per_file: 50, + ..Default::default() + }), + ) + .await + .unwrap(); + + let params = InvertedIndexParams::default(); + let 
fragments = dataset.get_fragments(); + let mut segments = Vec::new(); + for fragment in &fragments { + segments.push( + CreateIndexBuilder::new(&mut dataset, &["body"], IndexType::Inverted, &params) + .name("body_idx".to_string()) + .fragments(vec![fragment.id() as u32]) + .execute_uncommitted() + .await + .unwrap(), + ); + } + dataset + .commit_existing_index_segments("body_idx", "body", segments) + .await + .unwrap(); + + // Update the first 25 rows (older segment's fragment): body -> "beta". + let res = UpdateBuilder::new(Arc::new(dataset.clone())) + .update_where("id < 25") + .unwrap() + .set("body", "'beta'") + .unwrap() + .build() + .unwrap() + .execute() + .await + .unwrap(); + dataset = res.new_dataset.as_ref().clone(); + + dataset + .optimize_indices(&OptimizeOptions::default()) + .await + .unwrap(); + let dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + + let mut scan = dataset.scan(); + scan.full_text_search(FullTextSearchQuery::new("alpha".to_owned())) + .unwrap(); + assert_eq!( + scan.count_rows().await.unwrap(), + 25, + "FTS stale rows leaked from the older, unmerged segment" + ); + } + + /// `optimize_indices` builds the stable-row-id allow-list by subtracting each + /// fragment's deletion vector. If a deletion vector cannot be read, the merge + /// must fail loudly: swallowing the error (treating the load as "no + /// deletions") would put every deleted row back into the allow-list and + /// silently reintroduce the stale entries this fix removes. Simulate an + /// unreadable deletion vector by deleting the file the manifest still + /// references, then assert optimize errors instead of succeeding. 
+ #[tokio::test] + async fn test_optimize_errors_when_deletion_vector_unreadable() { + use crate::dataset::UpdateBuilder; + use arrow_array::Int32Array; + use lance_table::io::deletion::deletion_file_path; + + let test_dir = TempStrDir::default(); + let test_uri = test_dir.as_str(); + + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("num", DataType::Int32, false), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from_iter_values(0..100)), + Arc::new(Int32Array::from_iter_values(0..100)), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + let mut dataset = Dataset::write( + reader, + test_uri, + Some(WriteParams { + enable_stable_row_ids: true, + ..Default::default() + }), + ) + .await + .unwrap(); + + dataset + .create_index( + &["num"], + IndexType::BTree, + None, + &ScalarIndexParams::default(), + true, + ) + .await + .unwrap(); + + // Update rewrites the first 25 rows under the same stable row ids, + // leaving a deletion vector on the original fragment. + UpdateBuilder::new(Arc::new(dataset.clone())) + .update_where("id < 25") + .unwrap() + .set("num", "-1") + .unwrap() + .build() + .unwrap() + .execute() + .await + .unwrap(); + + // Reload cold (nothing has cached the deletion vector), then remove the + // deletion file the manifest still references so the next read fails. 
+ let mut dataset = DatasetBuilder::from_uri(test_uri).load().await.unwrap(); + let mut removed = 0; + for fragment in dataset.get_fragments() { + if let Some(deletion_file) = fragment.metadata().deletion_file.clone() { + let path = + deletion_file_path(&dataset.base, fragment.metadata().id, &deletion_file); + dataset.object_store.delete(&path).await.unwrap(); + removed += 1; + } + } + assert_eq!( + removed, 1, + "update should have left exactly one deletion file" + ); + + let result = dataset.optimize_indices(&OptimizeOptions::default()).await; + assert!( + result.is_err(), + "optimize must fail when a deletion vector cannot be read, not \ + silently keep the deleted rows in the index" + ); + } + #[tokio::test] async fn test_optimize_scalar_no_unindexed_fragments() { let test_dir = TempStrDir::default(); From 52666294292e4419b8dd11a653d07d55cb8db6cb Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Wed, 24 Jun 2026 01:52:18 +0800 Subject: [PATCH 177/177] docs: clarify local LTO guidance (#7400) This clarifies the repository agent instructions for Rust local workflows. Local development, debugging, and performance testing should avoid LTO so those workflows do not accidentally pay release-artifact build costs or benchmark a build mode that was not explicitly requested. LTO remains available when release artifact validation explicitly asks for it. --- .cargo/config.toml | 7 +++++++ AGENTS.md | 3 +++ 2 files changed, 10 insertions(+) diff --git a/.cargo/config.toml b/.cargo/config.toml index 1d9c9ecc9da..c455c4a978d 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -9,6 +9,13 @@ debug = true codegen-units = 16 lto = "thin" +[profile.release-no-lto] +inherits = "release" +debug = true +lto = false +# Prioritize compile time when LTO is not relevant to the measurement. 
+codegen-units = 16 + [profile.bench] inherits = "release" lto = "thin" diff --git a/AGENTS.md b/AGENTS.md index 2003d6dba10..30c0abea1a7 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -41,6 +41,9 @@ Key technical traits: async-first (tokio), Arrow-native, versioned writes with m * Coverage: `cargo +nightly llvm-cov -q -p --branch` * Coverage HTML: `cargo +nightly llvm-cov -q -p --branch --html` * Coverage for file: `python ci/coverage.py -p -f ` +* Use repository-defined Cargo profiles instead of ad hoc LTO overrides. +* Use `release-with-debug` for benchmarks and profiling so optimized builds keep debug symbols without a rebuild. +* Use `release-no-lto` only for local debugging, IO-bound benchmarks, or compile-time-sensitive performance investigation where LTO would not affect the measured bottleneck. ### Python / Java